library(factoextra)
package ‘factoextra’ was built under R version 3.6.2Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
#path to protected dbGap data location
local_path <- "~/Documents/MOMS-PI"# change me
#local_path <- "~/MOMS-PI"# change me
path2 <- file.path(local_path,"71694", "PhenoGenotypeFiles", "RootStudyConsentSet_phs001523.MOMS_PI.v1.p1.c1.DS-PREG-COM-IRB-PUB-MDS", "PhenotypeFiles") 
otu <- read.csv(file = file.path(path2, "MergedFiles", "OTU_table.csv"))
rownames(otu) <- otu$X
otu$X <- NULL 
t_otu <- as.data.frame(t(otu))
meta <- read.csv(file = file.path(path2, "MergedFiles", "Meta_data.csv"))
#physical activity variable
phys <- meta[,98:100]
meta$phys <- ifelse(meta$physical_activity_vigorous == "0_times" | is.na(meta$physical_activity_vigorous), "None", "Vigorous")
meta$phys[which(meta$phys == "None" & meta$physical_activity_moderate != "0_times" & !is.na(meta$physical_activity_moderate))] <- "Moderate"
meta$phys[which(meta$phys == "None" & meta$physical_activity_light != "0_times" & !is.na(meta$physical_activity_light))] <- "Light"
meta$phys <- as.factor(meta$phys)
#filter to just two races
meta <- meta %>%
  filter(race == "african_american" | race == "caucasian")
meta$race <- factor(meta$race)
rownames(meta) <- meta$X
meta$X <- NULL
t_otu <- t_otu[rownames(meta),]
meta <- meta %>%
  mutate(antibiotics_current = replace_na(antibiotics_current, "No"))
meta <- meta %>%
  mutate(infertility_treatment = replace_na(infertility_treatment, "No")) %>%
  mutate(infertility_treatment = recode(infertility_treatment, Not_sure = "No", .default = levels(meta$infertility_treatment)))
meta <- meta %>%
  mutate(vdischarge = replace_na(vdischarge, "No")) %>%
  mutate(vdischarge = recode(vdischarge, Not_Sure = "No", .default = levels(meta$vdischarge)))
meta <- meta %>%
  mutate(vodor = replace_na(vodor, "No")) %>%
  mutate(vodor = recode(vodor, Not_Sure = "No", .default = levels(meta$vodor)))
meta <- meta %>%
  mutate(vitching = replace_na(vitching, "No"))
meta <- meta %>%
  mutate(douche = replace_na(douche, "More_than_1_year_ago_or_never"))
meta <- meta %>%
  mutate(bv_history = replace_na(bv_history, "No")) %>%
  mutate(bv_history = recode(bv_history, Not_Sure = "No", .default = levels(meta$bv_history)))
meta <- meta %>%
  mutate(vaginitis_history = replace_na(vaginitis_history, "No"))
meta <- meta %>%
  mutate(current_medications = replace_na(current_medications, "No")) %>%
  mutate(current_medications = recode(current_medications, No_I_am_not_taking_any_new_medications = "No", .default = levels(meta$current_medications)))
meta <- meta %>%
  mutate(smoker_current = replace_na(smoker_current, "No"))
meta <- meta %>%
  mutate(alcohol_frequency_year = replace_na(alcohol_frequency_year, "0_times"))
meta <- meta %>%
  mutate(diabetes = replace_na(diabetes, "No")) %>%
  mutate(diabetes = recode(diabetes, Not_sure = "No", .default = levels(meta$diabetes)))
meta <- meta %>%
  mutate(no_false_labor = replace_na(no_false_labor, "Yes")) %>%
  mutate(no_false_labor = recode(no_false_labor, No_I_have_not_experienced_false_labor = "Yes", .default = levels(meta$no_false_labor)))
meta <- meta %>%
  mutate(no_high_bp = replace_na(no_high_bp, "Yes"))
meta <- meta %>%
  mutate(no_vbleeding = replace_na(no_vbleeding, "Yes"))
meta <- meta %>%
  mutate(progesterone = replace_na(progesterone, "No")) %>%
  mutate(progesterone = recode(progesterone, Not_Sure = "No", .default = levels(meta$progesterone)))
levels(meta$age)[1] <- NA
#Create combined table
full_set <- cbind(t_otu, meta)
full_set$alpha_div <- diversity(full_set[,1:length(names(t_otu))], index="shannon")
full_set$num_reads <- rowSums(t_otu)
#Add vagitype
mypropdata <- full_set[,colnames(t_otu)]
mytypes <- apply(mypropdata,1,which.max)
maxprop <- as.numeric(mypropdata[matrix(c(1:nrow(mypropdata),mytypes), ncol=2)])
mytypes <- colnames(mypropdata)[mytypes]
mytypes[maxprop < 0.3] <- "No Type"
mytypes <- as.factor(mytypes)
full_set <- full_set %>%
  mutate(vagitype=mytypes)
high_vagitype <- sort(table(full_set$vagitype), decreasing=T)[1:5]
high_vagitype

            Lactobacillus_iners Lactobacillus_crispatus_cluster           Lachnospiraceae_BVAB1 
                            714                             396                             228 
          Gardnerella_vaginalis   Lactobacillus_gasseri_cluster 
                            128                              64 
#Add proportions
full_set$p_Iners <- full_set$Lactobacillus_iners/full_set$num_reads
full_set$p_Crisp <- full_set$Lactobacillus_crispatus_cluster/full_set$num_reads
full_set$p_Gard <- full_set$Gardnerella_vaginalis/full_set$num_reads
full_set$Gass <- full_set$Lactobacillus_gasseri_cluster/full_set$num_reads
full_set$p_BVAB <- full_set$Lachnospiraceae_BVAB1/full_set$num_reads
get_mode = function(x, multimodal.na="TRUE"){
  modes <- Mode(x)
  if (multimodal.na=="FALSE" | length(modes)==1) {
    return(modes[1]) 
  } else {
    return(modes[length(modes)+1])
  }
}
#582 subjects
#aggregate at patient level
demog <- aggregate(visit_number ~ SUBJECT_ID, data = meta%>%data.frame, FUN = min)
demog <- merge(demog,meta%>%data.frame, by = c("SUBJECT_ID", "visit_number") )
  
#number visits per subject 
df <- aggregate(visit_number ~ SUBJECT_ID, data = meta%>%data.frame, FUN = length)
colnames(df)[which(colnames(df) == "visit_number")] <- "number of visits"
demog <- merge(demog,df, by = c("SUBJECT_ID") )
#average income per subject 
df <- meta %>% group_by(SUBJECT_ID) %>% summarise(income = get_mode(income, multimodal.na = "FALSE"))
colnames(df)[which(colnames(df) == "income")] <- "average income"
demog <- merge(demog,df, by = c("SUBJECT_ID") )
#average education per subject 
df <- meta %>% group_by(SUBJECT_ID) %>% summarise(education = get_mode(education, multimodal.na = "FALSE"))
colnames(df)[which(colnames(df) == "education")] <- "average education"
demog <- merge(demog,df, by = c("SUBJECT_ID") )
#average age per subject 
df <- meta %>% group_by(SUBJECT_ID) %>% summarise(age = get_mode(age, multimodal.na = "FALSE"))
colnames(df)[which(colnames(df) == "age")] <- "average age"
demog <- merge(demog,df, by = c("SUBJECT_ID") )
#average prenatal care start per subject 
df <- meta %>% group_by(SUBJECT_ID) %>% summarise(prenatal_care_start = get_mode(prenatal_care_start, multimodal.na = "FALSE"))
colnames(df)[which(colnames(df) == "prenatal_care_start")] <- "average p_care_start"
demog <- merge(demog,df, by = c("SUBJECT_ID") )
#average infertility treatment per subject 
df <- meta %>% group_by(SUBJECT_ID) %>% summarise(infertility_treatment = get_mode(infertility_treatment, multimodal.na = "FALSE"))
colnames(df)[which(colnames(df) == "infertility_treatment")] <- "average inf_trt"
demog <- merge(demog,df, by = c("SUBJECT_ID") )
#average v_discharge per subject 
df <- meta %>% group_by(SUBJECT_ID) %>% summarise(vdischarge = get_mode(vdischarge, multimodal.na = "FALSE"))
colnames(df)[which(colnames(df) == "vdischarge")] <- "average v_discharge"
demog <- merge(demog,df, by = c("SUBJECT_ID") )
#average v_odor per subject 
df <- meta %>% group_by(SUBJECT_ID) %>% summarise(vodor = get_mode(vodor, multimodal.na = "FALSE"))
colnames(df)[which(colnames(df) == "vodor")] <- "average v_odor"
demog <- merge(demog,df, by = c("SUBJECT_ID") )
#average v_itching per subject 
df <- meta %>% group_by(SUBJECT_ID) %>% summarise(vitching = get_mode(vitching, multimodal.na = "FALSE"))
colnames(df)[which(colnames(df) == "vitching")] <- "average v_itching"
demog <- merge(demog,df, by = c("SUBJECT_ID") )
#average douche per subject 
df <- meta %>% group_by(SUBJECT_ID) %>% summarise(douche = get_mode(douche, multimodal.na = "FALSE"))
colnames(df)[which(colnames(df) == "douche")] <- "average douche"
demog <- merge(demog,df, by = c("SUBJECT_ID") )
#average bv_history per subject 
df <- meta %>% group_by(SUBJECT_ID) %>% summarise(bv_history = get_mode(bv_history, multimodal.na = "FALSE"))
colnames(df)[which(colnames(df) == "bv_history")] <- "average bv_history"
demog <- merge(demog,df, by = c("SUBJECT_ID") )
#average vaginitis_history per subject 
df <- meta %>% group_by(SUBJECT_ID) %>% summarise(vaginitis_history = get_mode(vaginitis_history, multimodal.na = "FALSE"))
colnames(df)[which(colnames(df) == "vaginitis_history")] <- "average vaginitis_history"
demog <- merge(demog,df, by = c("SUBJECT_ID") )
#average current_medications per subject 
df <- meta %>% group_by(SUBJECT_ID) %>% summarise(current_medications = get_mode(current_medications, multimodal.na = "FALSE"))
colnames(df)[which(colnames(df) == "current_medications")] <- "average current_medications"
demog <- merge(demog,df, by = c("SUBJECT_ID") )
#average smoker_current per subject 
df <- meta %>% group_by(SUBJECT_ID) %>% summarise(smoker_current = get_mode(smoker_current, multimodal.na = "FALSE"))
colnames(df)[which(colnames(df) == "smoker_current")] <- "average smoker_current"
demog <- merge(demog,df, by = c("SUBJECT_ID") )
#average alcohol_frequency per subject 
df <- meta %>% group_by(SUBJECT_ID) %>% summarise(alcohol_frequency_year = get_mode(alcohol_frequency_year, multimodal.na = "FALSE"))
colnames(df)[which(colnames(df) == "alcohol_frequency_year")] <- "average alcohol_frequency"
demog <- merge(demog,df, by = c("SUBJECT_ID") )
#average diabetes per subject 
df <- meta %>% group_by(SUBJECT_ID) %>% summarise(diabetes = get_mode(diabetes, multimodal.na = "FALSE"))
colnames(df)[which(colnames(df) == "diabetes")] <- "average diabetes"
demog <- merge(demog,df, by = c("SUBJECT_ID") )
#average false_labor per subject 
df <- meta %>% group_by(SUBJECT_ID) %>% summarise(no_false_labor = get_mode(no_false_labor, multimodal.na = "FALSE"))
colnames(df)[which(colnames(df) == "no_false_labor")] <- "average no_false_labor"
demog <- merge(demog,df, by = c("SUBJECT_ID") )
#average high_bp per subject 
df <- meta %>% group_by(SUBJECT_ID) %>% summarise(no_high_bp = get_mode(no_high_bp, multimodal.na = "FALSE"))
colnames(df)[which(colnames(df) == "no_high_bp")] <- "average no_high_bp"
demog <- merge(demog,df, by = c("SUBJECT_ID") )
#average v_bleeding per subject 
df <- meta %>% group_by(SUBJECT_ID) %>% summarise(no_vbleeding = get_mode(no_vbleeding, multimodal.na = "FALSE"))
colnames(df)[which(colnames(df) == "no_vbleeding")] <- "average no_vbleeding"
demog <- merge(demog,df, by = c("SUBJECT_ID") )
#average progesterone per subject 
df <- meta %>% group_by(SUBJECT_ID) %>% summarise(progesterone = get_mode(progesterone, multimodal.na = "FALSE"))
colnames(df)[which(colnames(df) == "progesterone")] <- "average progesterone"
demog <- merge(demog,df, by = c("SUBJECT_ID") )
#average vaginal_ph across visits
#Some patients have na so goes to 568
df <- aggregate(vaginal_pH ~ SUBJECT_ID, data = meta%>%data.frame, FUN = mean)
colnames(df)[which(colnames(df) == "vaginal_pH")] <- "mean_vaginal_ph"
demog <- merge(demog,df, by = c("SUBJECT_ID") )
#alpha diversity
df <- aggregate(alpha_div ~ SUBJECT_ID, data = full_set%>%data.frame, FUN = mean)
colnames(df)[which(colnames(df) == "alpha_div")] <- "mean_alpha_div"
demog <- merge(demog,df, by = c("SUBJECT_ID") )
#reported_ga is only for minimum recorded visit for each individual
Varnames <- c("mean_vaginal_ph", "number of visits", "reported_ga", "mean_alpha_div", "average income", 
              "average education", "average age", "average p_care_start", "average inf_trt", 
              "average  v_discharge", "average v_odor", "average v_itching", "average douche", 
              "average bv_history", "average vaginitis_history", "average current_medications",
              "average smoker_current", "average alcohol_frequency", "average diabetes", 
              "average no_false_labor", "average no_high_bp", "average no_vbleeding", "average progesterone")
#racial breakdown, break down visits and other demographics
stratified1 = tableone::CreateTableOne(
  vars = Varnames[1:4],
  data = summarytools::unlabel(demog), strata = "race", includeNA = TRUE)
stratified2 = tableone::CreateTableOne(
  vars = Varnames[5:7],
  data = summarytools::unlabel(demog), strata = "race", includeNA = TRUE)
stratified3 = tableone::CreateTableOne(
  vars = Varnames[8:15],
  data = summarytools::unlabel(demog), strata = "race", includeNA = TRUE)
The data frame does not have: average  v_discharge  Dropped
stratified4 = tableone::CreateTableOne(
  vars = Varnames[16:19],
  data = summarytools::unlabel(demog), strata = "race", includeNA = TRUE)
stratified5 = tableone::CreateTableOne(
  vars = Varnames[20:23],
  data = summarytools::unlabel(demog), strata = "race", includeNA = TRUE)
#stratified1 <- print(stratified1, printToggle = FALSE, showAllLevels = FALSE)
#stratified1 = stratified1[,!(colnames(stratified1) %in% "test")]%>%
#  knitr::kable(format="latex")%>% 
#  kable_styling("striped", full_width = F, font_size = 9)
stratified1
                              Stratified by race
                               african_american caucasian    p      test
  n                             366              152                    
  mean_vaginal_ph (mean (SD))  4.85 (0.80)      4.74 (0.72)   0.130     
  number of visits (mean (SD)) 3.57 (1.72)      3.36 (1.49)   0.196     
  reported_ga (%)                                             0.032     
     First Trimester             86 (23.5)        48 (31.6)             
     Second Trimester           179 (48.9)        76 (50.0)             
     Third Trimester             98 (26.8)        25 (16.4)             
     NA                           3 ( 0.8)         3 ( 2.0)             
  mean_alpha_div (mean (SD))   1.12 (0.59)      0.91 (0.65)  <0.001     
#fix the part where every race is shown, probably due to meta being called
#stratified1 <- print(stratified1, printToggle = FALSE, showAllLevels = FALSE)
#stratified1 = stratified1[,!(colnames(stratified1) %in% "test")]%>%
#  knitr::kable(format="latex")%>% 
#  kable_styling("striped", full_width = F, font_size = 9)
stratified2
                                    Stratified by race
                                     african_american caucasian   p      test
  n                                  366              152                    
  average income (%)                                              <0.001     
     15_000_19_999                    33 ( 9.0)         9 ( 5.9)             
     20_000_39_999                    40 (10.9)        20 (13.2)             
     40_000_59_999                     8 ( 2.2)        30 (19.7)             
     60_000_79_999                     3 ( 0.8)        25 (16.4)             
     80_000_or_more                    1 ( 0.3)        37 (24.3)             
     Under_15_000                    267 (73.0)        27 (17.8)             
     NA                               14 ( 3.8)         4 ( 2.6)             
  average education (%)                                           <0.001     
     2_year_College_Degree            29 ( 7.9)        16 (10.5)             
     4_year_College_Degree             7 ( 1.9)        47 (30.9)             
     Doctoral_or_Professional_Degree   1 ( 0.3)        18 (11.8)             
     High_School_GED                 182 (49.7)        27 (17.8)             
     Less_than_High_School            50 (13.7)         1 ( 0.7)             
     Masters_Degree                    2 ( 0.5)        19 (12.5)             
     Some_College                     91 (24.9)        24 (15.8)             
     NA                                4 ( 1.1)         0 ( 0.0)             
  average age (%)                                                 <0.001     
                                       0 ( 0.0)         1 ( 0.7)             
     18                                7 ( 1.9)         1 ( 0.7)             
     18_to_28                        239 (65.3)        49 (32.2)             
     29-38                            97 (26.5)        91 (59.9)             
     Above_38                         15 ( 4.1)         9 ( 5.9)             
     Below_18                          6 ( 1.6)         1 ( 0.7)             
     NA                                2 ( 0.5)         0 ( 0.0)             
#fix the part where every race is shown, probably due to meta being called
#stratified1 <- print(stratified1, printToggle = FALSE, showAllLevels = FALSE)
#stratified1 = stratified1[,!(colnames(stratified1) %in% "test")]%>%
#  knitr::kable(format="latex")%>% 
#  kable_styling("striped", full_width = F, font_size = 9)
stratified3
                                                          Stratified by race
                                                           african_american caucasian   p      test
  n                                                        366              152                    
  average p_care_start (%)                                                              <0.001     
     0_4_weeks_after_conception                            101 (27.6)        33 (21.7)             
     4_weeks_to_the_end_of_the_first_trimester             157 (42.9)        88 (57.9)             
     Before_conception                                       9 ( 2.5)        11 ( 7.2)             
     During_the_second_trimester                            51 (13.9)        14 ( 9.2)             
     During_the_third_trimester                              7 ( 1.9)         2 ( 1.3)             
     I_did_not_receive_prenatal_care                        15 ( 4.1)         0 ( 0.0)             
     I_have_not_yet_received_prenatal_care_but_planning_to  18 ( 4.9)         1 ( 0.7)             
     NA                                                      8 ( 2.2)         3 ( 2.0)             
  average inf_trt = Yes (%)                                  5 ( 1.4)        10 ( 6.6)   0.003     
  average v_odor = Yes (%)                                  30 ( 8.2)         4 ( 2.6)   0.033     
  average v_itching = Yes (%)                               23 ( 6.3)         5 ( 3.3)   0.246     
  average douche (%)                                                                    <0.001     
     More_than_1_month_ago                                  84 (23.0)         8 ( 5.3)             
     More_than_1_year_ago_or_never                         279 (76.2)       144 (94.7)             
     Within_the_last_24_hours                                3 ( 0.8)         0 ( 0.0)             
  average bv_history (%)                                                                 0.494     
     No                                                    286 (78.1)       125 (82.2)             
     Yes                                                    79 (21.6)        27 (17.8)             
     NA                                                      1 ( 0.3)         0 ( 0.0)             
  average vaginitis_history = Yes (%)                       13 ( 3.6)         2 ( 1.3)   0.274     
#fix the part where every race is shown, probably due to meta being called
#stratified1 <- print(stratified1, printToggle = FALSE, showAllLevels = FALSE)
#stratified1 = stratified1[,!(colnames(stratified1) %in% "test")]%>%
#  knitr::kable(format="latex")%>% 
#  kable_styling("striped", full_width = F, font_size = 9)
stratified4
                                       Stratified by race
                                        african_american caucasian   p      test
  n                                     366              152                    
  average current_medications = Yes (%) 141 (38.5)        99 (65.1)  <0.001     
  average smoker_current = Yes (%)       54 (14.8)        21 (13.8)   0.889     
  average alcohol_frequency (%)                                      <0.001     
     0_times                            223 (60.9)        36 (23.7)             
     1_2_times                           54 (14.8)        28 (18.4)             
     10_19_times                         12 ( 3.3)        20 (13.2)             
     20_times                            13 ( 3.6)        37 (24.3)             
     3_5_times                           42 (11.5)        19 (12.5)             
     6_9_times                           21 ( 5.7)        12 ( 7.9)             
     NA                                   1 ( 0.3)         0 ( 0.0)             
  average diabetes (%)                                                0.529     
     No                                 356 (97.3)       145 (95.4)             
     Yes                                  9 ( 2.5)         6 ( 3.9)             
     NA                                   1 ( 0.3)         1 ( 0.7)             
#fix the part where every race is shown, probably due to meta being called
#stratified1 <- print(stratified1, printToggle = FALSE, showAllLevels = FALSE)
#stratified1 = stratified1[,!(colnames(stratified1) %in% "test")]%>%
#  knitr::kable(format="latex")%>% 
#  kable_styling("striped", full_width = F, font_size = 9)
stratified5
                            Stratified by race
                             african_american caucasian   p      test
  n                          366              152                    
  average no_false_labor (%)                               NaN       
                               0 ( 0.0)         0 ( 0.0)             
     No                       37 (10.1)         2 ( 1.3)             
     Yes                     321 (87.7)       148 (97.4)             
     NA                        8 ( 2.2)         2 ( 1.3)             
  average no_high_bp (%)                                   NaN       
                               0 ( 0.0)         0 ( 0.0)             
     No                       42 (11.5)         9 ( 5.9)             
     Yes                     312 (85.2)       140 (92.1)             
     NA                       12 ( 3.3)         3 ( 2.0)             
  average no_vbleeding (%)                                 0.057     
                               0 ( 0.0)         3 ( 2.0)             
     No                       31 ( 8.5)        14 ( 9.2)             
     Yes                     327 (89.3)       131 (86.2)             
     NA                        8 ( 2.2)         4 ( 2.6)             
  average progesterone (%)                                 0.520     
     No                      358 (97.8)       147 (96.7)             
     Yes                       7 ( 1.9)         5 ( 3.3)             
     NA                        1 ( 0.3)         0 ( 0.0)             
#fix the part where every race is shown, probably due to meta being called
#Find out individuals who have a different vagitype across visits
uniq_vagitype <- full_set %>%
  group_by(SUBJECT_ID) %>%
  summarize(n_unique = n_distinct(vagitype))
more_vagitype <- uniq_vagitype %>%
  filter(n_unique > 1)
full_set %>%
  filter(SUBJECT_ID %in% more_vagitype$SUBJECT_ID) %>%
  group_by(SUBJECT_ID) %>%
  distinct(vagitype)
#Look at proportional changes over time, look at differences betweeen those that did change and those that did not
#Look at how they change, if they change similarly
count_table <- aggregate(list(full_set$Lactobacillus_iners, full_set$p_Iners,full_set$Lactobacillus_crispatus_cluster, full_set$p_Crisp, full_set$Gardnerella_vaginalis, full_set$p_Gard, full_set$Lactobacillus_gasseri_cluster, full_set$Gass, full_set$Lachnospiraceae_BVAB1, full_set$p_BVAB), by = list(full_set$race, full_set$reported_ga), mean)
colnames(count_table) <- c("race", "reported_ga", "Lactobacillus_iners", "p_Iners", "Crispatus", "p_Crisp", 
                           "Gardnerella", "p_Gard", "Gasseri", "p_Gass", "BVAB", "p_BVAB")
count_table <- count_table %>%
  arrange(race)
kable(count_table,  table.attr = "style = \"color: black;\"") %>%
  kable_styling(bootstrap_options = c("striped", "hover"))
race reported_ga Lactobacillus_iners p_Iners Crispatus p_Crisp Gardnerella p_Gard Gasseri p_Gass BVAB p_BVAB
african_american First Trimester 14505.96 0.2517823 8966.637 0.1340359 7985.951 0.1107511 652.6078 0.0149517 8863.471 0.1252649
african_american Second Trimester 21640.13 0.3400013 10559.085 0.1383411 5455.281 0.0905662 867.7945 0.0155340 10614.083 0.1433958
african_american Third Trimester 26061.41 0.4024149 10693.302 0.1511822 3437.836 0.0607404 1055.6137 0.0177334 7438.261 0.1027035
caucasian First Trimester 12899.98 0.1803131 24551.900 0.3699975 9399.640 0.1066708 2068.4200 0.0527995 1095.420 0.0132448
caucasian Second Trimester 12919.27 0.2402719 27804.674 0.3455106 3564.785 0.0595546 5281.7403 0.0920684 3087.961 0.0345624
caucasian Third Trimester 14819.43 0.2555657 23575.302 0.3017310 3401.765 0.0562494 4193.1306 0.0713278 1420.843 0.0170265
#Plot proportions over time, plot count over time
iners <- ggplot(count_table, aes(reported_ga, p_Iners)) + 
  geom_line(aes(group=race, color=race)) 
crisp <- ggplot(count_table, aes(reported_ga, p_Crisp)) + 
  geom_line(aes(group=race, color=race)) 
gard <- ggplot(count_table, aes(reported_ga, p_Gard)) + 
  geom_line(aes(group=race, color=race)) 
gass <- ggplot(count_table, aes(reported_ga, p_Gass)) + 
  geom_line(aes(group=race, color=race)) 
bvab <- ggplot(count_table, aes(reported_ga, p_BVAB)) + 
  geom_line(aes(group=race, color=race)) 
library(gridExtra)

Attaching package: ‘gridExtra’

The following object is masked from ‘package:dplyr’:

    combine
ggarrange(iners, crisp, bvab, gard, gass, nrow = 3)
$`1`

$`2`

attr(,"class")
[1] "list"      "ggarrange"

#Plot individuals for each race
#ggplot(full_set, aes(reported_ga, p_Iners)) + 
#  geom_line(aes(group=SUBJECT_ID, color=race)) + 
#  facet_wrap(~race, ncol=1)
#Look at those with the specific taxa as their vagitype
prop_plot <- ggplot(full_set, aes(p_Iners)) + geom_histogram() + geom_vline(xintercept=mean(full_set$p_Iners), lwd=1, linetype=2, color="red") + geom_vline(xintercept=median(full_set$p_Iners), lwd=1, linetype=2, color="green")
#prop_plot
cprop_plot <- ggplot(full_set, aes(p_Crisp)) + geom_histogram() + geom_vline(xintercept=mean(full_set$p_Crisp), lwd=1, linetype=2, color="red") + geom_vline(xintercept=median(full_set$p_Crisp), lwd=1, linetype=2, color="green")
#iprop_plot
bprop_plot <- ggplot(full_set, aes(p_BVAB)) + geom_histogram() + geom_vline(xintercept=mean(full_set$p_BVAB), lwd=1, linetype=2, color="red") + geom_vline(xintercept=median(full_set$p_BVAB), lwd=1, linetype=2, color="green")
#pprop_plot
gprop_plot <- ggplot(full_set, aes(p_Gard)) + geom_histogram() + geom_vline(xintercept=mean(full_set$p_Gard), lwd=1, linetype=2, color="red") + geom_vline(xintercept=median(full_set$p_Gard), lwd=1, linetype=2, color="green")
#tprop_plot
gasprop_plot <- ggplot(full_set, aes(Gass)) + geom_histogram() + geom_vline(xintercept=mean(full_set$Gass), lwd=1, linetype=2, color="red") + geom_vline(xintercept=median(full_set$Gass), lwd=1, linetype=2, color="green")
#bprop_plot
#stratify by race
ggarrange(prop_plot, cprop_plot, bprop_plot, gprop_plot, gasprop_plot, labels = c("Iners","Crisp", "BVAB", "Gard", "Gass"), ncol = 3, nrow = 2)
`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

race_prop_plot <- ggplot(full_set, aes(p_Iners)) + geom_histogram(aes(group=race, fill=race)) + facet_wrap(~race) 
crace_prop_plot <- ggplot(full_set, aes(p_Crisp)) + geom_histogram(aes(group=race, fill=race)) + facet_wrap(~race) 
#stratify by race
brace_prop_plot <- ggplot(full_set, aes(p_BVAB)) + geom_histogram(aes(group=race, fill=race)) + facet_wrap(~race) 
grace_prop_plot <- ggplot(full_set, aes(p_Gard)) + geom_histogram(aes(group=race, fill=race)) + facet_wrap(~race)
gassrace_prop_plot <- ggplot(full_set, aes(Gass)) + geom_histogram(aes(group=race, fill=race)) + facet_wrap(~race)
ggarrange(race_prop_plot, crace_prop_plot, brace_prop_plot, grace_prop_plot, gassrace_prop_plot, nrow = 3)
`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
$`1`

$`2`

attr(,"class")
[1] "list"      "ggarrange"

#Zero-Inflated Poisson Model
z <- glmmTMB(Lactobacillus_iners ~ vaginal_pH + reported_ga + alpha_div + (1|SUBJECT_ID), data = full_set, family = poisson, zi = ~ vaginal_pH + reported_ga + alpha_div)
summary(z)
hist(predict(z))
plot(full_set$Lactobacillus_iners[which(names(linpred) %in% rownames(full_set))], linpred)

#summary(m1 <- zeroinfl(Lactobacillus_iners ~ vaginal_pH + reported_ga + alpha_div |SUBJECT_ID, data = full_set))
z <- glmmTMB(Lactobacillus_iners ~ vaginal_pH + reported_ga + alpha_div + (1|SUBJECT_ID), data = full_set, family = poisson, zi = ~ 0)
summary(z)
hist(predict(z))


#Zero-Inflated Negative Binomial Model
neg_bin<- glmmTMB(Lactobacillus_iners ~ vaginal_pH + reported_ga + alpha_div + offset(log(num_reads)) + (1|SUBJECT_ID), data = full_set, family = nbinom2, zi = ~ vaginal_pH + reported_ga + alpha_div)
summary(neg_bin)
predict(neg_bin)
neg_binone <- glmmTMB(Lactobacillus_iners ~ vaginal_pH + reported_ga + alpha_div + offset(log(num_reads)) + (1|SUBJECT_ID), data = full_set, family = nbinom2, zi = ~ 1)
summary(neg_binone)
hist(predict(neg_binone))
#Zero-Inflated Beta-Binomial Model
#kostic.y <- cbind(1, kostic.y=="Tumor")
beta_bin<- glmmTMB(Lactobacillus_iners ~ vaginal_pH + reported_ga + alpha_div + race + (1|SUBJECT_ID), data = full_set, family=betabinomial(link = "logit"), zi = ~ vaginal_pH + reported_ga + alpha_div + race)


#Mixed Linear Regression Model
mixedL <- lmer(Lactobacillus_iners ~ vaginal_pH + reported_ga + alpha_div + race + (1|SUBJECT_ID), data = full_set)
summary(mixedL)
linpred <- predict(mixedL)
#predicted
#x-axis people
#y-axis visits for that person
plot(full_set$Lactobacillus_iners[which(names(linpred) %in% rownames(full_set))], linpred)


## Beta bimonial zero inflated ##

## NBZIMM ##
f21 = glmm.zinb(Lactobacillus_iners ~ vaginal_pH + reported_ga + alpha_div , random = ~ 1|SUBJECT_ID, data = full_set)
summary(f21)
summary(f21)$tTable[4, 5]
hist(full_set$p_Iners)
beta_bin<- glmmTMB(p_Iners ~ vaginal_pH + reported_ga + alpha_div + race + (1|SUBJECT_ID), data = full_set, family=betabinomial(link = "logit"), zi = ~ vaginal_pH + reported_ga + alpha_div + race)
mean(full_set$p_Iners)
var(full_set$p_Iners)

library(zoib)
#zoib
#zoib(p_Iners ~ vaginal_pH + reported_ga + alpha_div + race | 1|1, data = full_set, random = 1, EUID = full_set$SUBJECT_ID)
library(aod)
full_set$p_Iners[which(full_set$p_Iners == 1)] <- 0.99999
betaIners <- glmmTMB(p_Iners ~ vaginal_pH + reported_ga + alpha_div + race + (1|SUBJECT_ID), data = full_set, family=beta_family(link = "logit"), zi = ~ 1)
betapred <- predict(betaIners)
summary(betaIners)
#predicted
#x-axis people
#y-axis visits for that person
#plot(full_set$p_Iners[which(names(betapred) %in% rownames(full_set))], betapred)

#zoib(p_Iners ~ vaginal_pH + reported_ga + alpha_div + race | 1, data = full_set)


mixedL <- lmer(p_Iners ~ vaginal_pH + reported_ga + alpha_div + race + (1|SUBJECT_ID), data = full_set)

#par(mfrow = c(2,2))

coefplot2(mixedL)
plot(mixedL)
linpred <- predict(mixedL)

#predicted
#x-axis people
#y-axis visits for that person
plot(full_set$p_Iners[which(names(linpred) %in% rownames(full_set))], linpred)
full_set$class_Iners <- as.factor(ifelse(full_set$p_Iners > mean(full_set$p_Iners), "High", "Low"))
full_set$class_Crisp <- as.factor(ifelse(full_set$p_Crisp > mean(full_set$p_Crisp), "High", "Low"))
full_set$class_Gass <- as.factor(ifelse(full_set$Gass > mean(full_set$Gass), "High", "Low"))
full_set$class_Gard <- as.factor(ifelse(full_set$p_Gard > mean(full_set$p_Gard), "High", "Low"))
full_set$class_BVAB <- as.factor(ifelse(full_set$p_BVAB > mean(full_set$p_BVAB), "High", "Low"))
#Check median
#add line to show where each point is
#install.packages("ggpubr")
iners_plot <- ggplot(full_set, aes(class_Iners)) + geom_bar()
snea_plot <- ggplot(full_set, aes(class_Crisp)) + geom_bar() 
tm_plot <- ggplot(full_set, aes(class_Gass)) + geom_bar() 
prev_plot <- ggplot(full_set, aes(class_Gard)) + geom_bar() 
bvab_plot <- ggplot(full_set, aes(class_BVAB)) + geom_bar()

ggarrange(iners_plot, snea_plot, tm_plot, prev_plot, bvab_plot, labels = c("Iners", "Crisp", "Gass", "Gard", "BVAB1"), ncol = 3, nrow = 2)
#Distribution of High and Low (do it across race)
irace_prop_plot <- ggplot(full_set, aes(class_Iners)) + geom_bar(aes(y= stat(prop),group=race, fill=race), position = "dodge2")+ ggtitle("Iners Dichotomized Histogram")
irace_prop_plot

#Distribution of High and Low (do it across race)
srace_prop_plot <- ggplot(full_set, aes(class_Crisp)) + geom_bar(aes(y= stat(prop), group=race, fill=race), position = "dodge2") + ggtitle("Crisp Dichotomized Histogram")
#Distribution of High and Low (do it across race)
prace_prop_plot <- ggplot(full_set, aes(class_Gass)) + geom_bar(aes(y= stat(prop), group=race, fill=race), position = "dodge2") +  ggtitle("Gass Dichotomized Histogram")
#Distribution of High and Low (do it across race)
trace_prop_plot <- ggplot(full_set, aes(class_Gard)) + geom_bar(aes(y= stat(prop), group=race, fill=race), position = "dodge2") + ggtitle("Gard Dichotomized Histogram")
#Distribution of High and Low (do it across race)
brace_prop_plot <- ggplot(full_set, aes(class_BVAB)) + geom_bar(aes(y= stat(prop),group=race, fill=race), position = "dodge2") + ggtitle("BVAB Dichotomized Histogram")
ggarrange(irace_prop_plot, srace_prop_plot, prace_prop_plot, trace_prop_plot, brace_prop_plot, nrow = 3)
$`1`

$`2`

attr(,"class")
[1] "list"      "ggarrange"

#Use proportions instead of counts for better comparison, DONE
#position dodge, DONE
#Add proportion values to top of bars
#do facet wrap by taxa
covs.taxa <- function(df, taxon, model, vars){
  tax_prop <- df[,taxon]/df[,"num_reads"]
   if(model == "fixed"){
    #covs <- vars
    model_data <- df[,vars]
    half <- paste(vars, collapse= "+")
  }
  else{
    #covs <- vars
    model_data <- df[,c(vars, "SUBJECT_ID", "num_reads")]
    half <- paste(paste(vars, collapse= "+"), " + (1|SUBJECT_ID)")
  }
  model_data <- cbind(tax_prop, model_data)
  no.na.data <- na.omit(model_data)
  form <- as.formula(paste("tax_prop ~ ", half))
  if(model == "fixed"){
    mod <- lm(form, data = no.na.data)
    step.back <- stepAIC(mod, direction = "backward")
    return(step.back)
    #return(mod)
  }
  else{
    mod <- lmer(form, data = no.na.data)
    return(mod)
    #return(mod.step)
  }
}
taxa.glm <- function(df, taxon, type, model){
  tax_count <- df[,taxon]
  tax_prop <- df[,taxon]/df[,"num_reads"]
  class_tax <- as.factor(ifelse(tax_prop > mean(tax_prop), "High", "Low"))
  class_tax <- recode(class_tax, "High"= 1, "Low"= 0)
  if(model == "fixed"){
    covs <- c("vaginal_pH", "reported_ga", "alpha_div", "race", "vagitype", "income", "bv_history", "phys", "antibiotics_current","age", "education", "prenatal_care_start", "infertility_treatment", "vdischarge", "vodor", "vitching", "douche", "vaginitis_history", "current_medications", "smoker_current", "alcohol_frequency_year", "diabetes", "no_false_labor", "no_high_bp", "no_vbleeding", "progesterone")
    model_data <- df[,covs]
    half <- paste(covs, collapse= "+")
  }
  else{
    covs <- c("vaginal_pH", "reported_ga", "alpha_div", "race", "vagitype", "income", "bv_history", "phys", "antibiotics_current","age", "education", "prenatal_care_start", "infertility_treatment", "vdischarge", "vodor", "vitching", "douche", "vaginitis_history", "current_medications", "smoker_current", "alcohol_frequency_year", "diabetes", "no_false_labor", "no_high_bp", "no_vbleeding", "progesterone")
    model_data <- df[,c(covs, "SUBJECT_ID", "num_reads")]
    half <- paste(paste(covs, collapse= "+"), " + (1|SUBJECT_ID)")
  }
  if(type == "count"){
    model_data <- cbind(tax_count, model_data)
    no.na.data <- na.omit(model_data)
    form <- as.formula(paste("tax_count ~ ", half))
    if(model == "fixed"){
      mod <- glm.nb(form, data = no.na.data)
    }
    else{
      mod <- glmer.nb(form, data = no.na.data)
    }
  }
  else if(type == "prop"){
    model_data <- cbind(tax_prop, model_data)
    no.na.data <- na.omit(model_data)
    form <- as.formula(paste("tax_prop ~ ", half))
    if(model == "fixed"){
      mod <- lm(form, data = no.na.data)
      #return(mod)
    }
    else{
      mod <- lmer(form, data = no.na.data)
      return(mod)
      #return(mod.step)
    }
  }
  else{
    model_data <- cbind(class_tax, model_data)
    no.na.data <- na.omit(model_data)
    form <- as.formula(paste("class_tax ~ ", paste(covs, collapse= "+")))
    if(model == "fixed"){
      mod <- glm(form, data = no.na.data, family = binomial)
    }
    else{
      mod <- glmer(form, data = no.na.data, family = binomial, control=glmerControl(optimizer="bobyqa",optCtrl=list(maxfun=2e5)))
    }
  }
  #step.model <- stepAIC(fixed_mod, direction = "both", trace = FALSE)
  step.back <- stepAIC(mod, direction = "backward", trace = FALSE)
  return(step.back)
  #TO ADD: Plot for each covariate the predicted distribution
  #Output table of common variables, extract p-value
}
model_plot <- function(model){
  final_covs <- names(model$model)[-1]
  print(deparse(substitute(model)))
  print(as.formula(paste("class_tax ~ ", paste(final_covs, collapse= "+"))))
  #Plot AIC for model
  vars <- unlist(lapply(strsplit(as.character(model$anova$Step), '- ', fixed = TRUE), '[', 2))
  vars[1] <- "Intercept"
  AIC.res <- data.frame(Variable = vars, AIC = model$anova$AIC)
  AIC.res$Variable <- factor(AIC.res$Variable, levels = vars)
  #true variables
  #true_vars <- paste0("var", 1:10)
  #colour_vars <- ifelse(AIC.res$Variable %in% true_vars, 'red','black')
  plot <- ggplot(data=AIC.res, aes(x=Variable, y=AIC, group=1)) +
    geom_line(color = "red")+
    theme_bw()+
    theme(axis.text.x = element_text(angle = 60, hjust = 1))
  return(plot)
}
pred_plot <- function(df, model){
  model_data <- df[,covs]
  model_data <- cbind(class_tax, model_data)
  no.na.data <- na.omit(model_data)
  final_covs <- names(model$model)[-1]
  pred_data <- no.na.data[,final_covs]
  return(predict(model, newdata=pred_data, type = "response"))
  
  
  
  # if(c("alpha_div","vagitype") %in% names(model$model)){
  #   plotting_dfm <- expand.grid(alpha_div = seq(0, 3.5, 0.01),
  #                          vagitype = c("Lactobacillus_crispatus_cluster", "Lactobacillus_iners", "Lachnospiraceae_BVAB1", "Gardnerella_vaginalis", "Lactobacillus_gasseri_cluster"))
  #   plotting_dfm$preds <- predict(model, newdata=plotting_dfm, type = "response")
  #   pl <- ggplot(plotting_dfm, aes(x=alpha_div, y =preds, color=as.factor(vagitype))) +
  #     geom_point( ) +
  #     ggtitle("Predicted High/Low by Alpha_Div and Vagitype")
  #   return(p1)
  # }
  # else{
  #   return ("No value")
  # }
}
get_data <- function(taxon, df){
  tax_prop <- df[,taxon]/df[,"num_reads"]
  covs <- c("vaginal_pH", "reported_ga", "alpha_div", "race", "vagitype", "income", "bv_history", "phys", "antibiotics_current","age", "education", "prenatal_care_start", "infertility_treatment", "vdischarge", "vodor", "vitching", "douche", "vaginitis_history", "current_medications", "smoker_current", "alcohol_frequency_year", "diabetes", "no_false_labor", "no_high_bp", "no_vbleeding", "progesterone")
  model_data <- df[,c(covs, "SUBJECT_ID", "num_reads")]
  model_data <- cbind(tax_prop, model_data)
  no.na.data <- na.omit(model_data)
  return(no.na.data)
}
make_hists<- function(model, taxon, df, vars){
  hists <- data.frame(xx = c(predict(model, newdata = get_data(taxon, df)[, vars]), get_data(taxon, df)[, "tax_prop"]), yy = rep(c("pred", "actual"),each = length(get_data(taxon, df)[, "tax_prop"])))
  return(ggplot(hists, aes(x=xx, fill=yy)) + geom_histogram(alpha=0.2, position="identity"))
}
agg_plot <- function(model, taxon, df, vars){
  ex <- data.frame(data = c(predict(model, newdata = get_data(taxon, df)[, vars]), get_data(taxon, df)[, "tax_prop"]), 
                   race = get_data(taxon, df)[, "race"], tri = get_data(taxon, df)[, "reported_ga"],
                   yy = rep(c("pred", "actual"),each = length(get_data(taxon, df)[, "tax_prop"])))
  agg_table <- aggregate(list(ex$data), by = list(ex$race, ex$tri, ex$yy), mean)
  colnames(agg_table) <- c("race", "tri", "type", "data")
  agg_plot <- ggplot(agg_table, aes(tri, data)) + 
geom_line(aes(group=interaction(race, type), color=race, linetype = type)) + 
    scale_x_discrete(labels=c("First Trimester" = "1st", "Second Trimester" = "2nd",
                              "Third Trimester" = "3rd")) + 
    ylab("Prop")
  return(agg_plot)
}
getPerformace <- function(model, df){
  taxa <- colnames(t_otu)
  example <- c("Lactobacillus_iners", "Lactobacillus_crispatus_cluster", "Gardnerella_vaginalis", "Lachnospiraceae_BVAB1", "Lactobacillus_gasseri_cluster", "Sneathia_amnii", "Prevotella_cluster2", "TM7_OTU-H1")
  for (name in example){
    md <- covs.taxa
  }
  perf <- model_performance(model, metrics = "all", verbose = TRUE)
  return(perf)
  
}
subsetFull <- function(df, taxalist){
  `%ni%` <- Negate(`%in%`)
  cols <- sapply(taxalist, function (x) any(df[,x]/df$num_reads > 0.05))
  df <- subset(df,select = names(df) %ni% names(which(cols == FALSE)))
  return(df)
}
  
covs <- c("vaginal_pH", "alpha_div", "reported_ga", "race", "income", "bv_history", "phys", "antibiotics_current","age", "education", "prenatal_care_start", "infertility_treatment", "vdischarge", "vodor", "vitching", "douche", "vaginitis_history", "current_medications", "smoker_current", "alcohol_frequency_year", "diabetes", "no_false_labor", "no_high_bp", "no_vbleeding", "progesterone")
model_data <- full_set[,covs]
ggpairs(model_data, columns = 1:2, ggplot2::aes(colour=race)) #too many points, too many vagitypes as well so had to remove
#do top 5 vagitypes
aa <- full_set[full_set$race == 'african_american',]
histogram(aa$vaginal_pH)
count_Iners <- taxa.glm(full_set, "Lactobacillus_iners", "count", "fixed") #Does not converge
prop_Iners <- taxa.glm(full_set, "Lactobacillus_iners", "prop", "fixed")
class_Iners <- taxa.glm(full_set, "Lactobacillus_iners", "class", "fixed")
mcount_Iners <- taxa.glm(full_set, "Lactobacillus_iners", "count", "mixed") #Does not converge
mprop_Iners <- taxa.glm(full_set, "Lactobacillus_iners", "prop", "mixed") #failure to converge in 10000 evaluations
mod_Crisp <- taxa.glm(full_set, "Lactobacillus_crispatus_cluster", "class", "fixed")
mod_Iners <- taxa.glm(full_set, "Lactobacillus_iners", "class", "fixed")
mod_BVAB <- taxa.glm(full_set, "Lachnospiraceae_BVAB1", "class", "fixed")
mod_Gard <- taxa.glm(full_set, "Gardnerella_vaginalis", "class", "fixed")
mod_Gass <- taxa.glm(full_set, "Lactobacillus_gasseri_cluster", "class", "fixed")


taxalist <- c("Lactobacillus_crispatus_cluster", "Lactobacillus_iners", "Lachnospiraceae_BVAB1", "Gardnerella_vaginalis", "Lactobacillus_gasseri_cluster")
var_df <- data.frame(matrix(ncol = 15, nrow = 15))
#colnames(var_df) <- taxalist
count <- 0
for(ind in c(1:length(taxalist))){
  mods <- taxa.glm(full_set, taxalist[ind], "prop", "mixed")
  no.na.data <- get_data(taxalist[ind], full_set)
  step.back <- lmerTest::step(mods)
  fix_vars <- c(rownames(step.back$fixed[which(step.back$fixed$Eliminated == 0),]), rownames(step.back$random[which(step.back$random$Eliminated == 0),]))
  for(i in c(1:length(fix_vars))){
    var_df[i,ind+count] <- fix_vars[i]
  }
  fixed_mods <- taxa.glm(full_set, taxalist[ind], "prop", "fixed")
  fixed_vars <- names(fixed_mods$model)[-1]
  for(i in c(1:length(fixed_vars))){
    var_df[i,ind+1+count] <- fixed_vars[i]
  }
  class_mods <- taxa.glm(full_set, taxalist[ind], "class", "fixed")
  class_vars <- names(class_mods$model)[-1]
  for(i in c(1:length(class_vars))){
    var_df[i,ind+2+count] <- class_vars[i]
  }
  count <- count + 2
}

colnames(var_df) <- c("Crisp_P_M", "Crisp_P_F", "Crisp_C_F", "Iners_P_M", "Iners_P_F","Iners_C_F", "BVAB1_P_M", "BVAB1_P_F","BVAB1_C_F","Gard_P_M", "Gard_P_F", "Gard_C_F","Gass_P_M","Gass_P_F", "Gass_C_F")
options(knitr.kable.NA = '')
kable(var_df, table.attr = "style = \"color: black;\"") %>%
  kable_styling(bootstrap_options = c("striped", "hover")) %>%
  add_header_above(c("Crispatus" = 3, "Iners" = 3, "BVAB" = 3, "Gardnerella" = 3, "Gasseri" = 3))
#Make table for the variables that are outputted
#need the data frame
#alpha div, vagitype, age, smoking, education, subject_id, vaginal_ph, race
#look at covariate estimates for each taxa model, consider correlations in model
#forward model selection, one variable addition at a time
#look at differences between taxa
#look for vagitypes in rare taxa
#performance with and without vagitype, compare using model comparison,ANOVA DONE
#overlay averages across each race across the 3 trimesters, output model metrics, try without and with vagitype: WROTE FUNCTION, APPLY TO ALL TAXA
#how much does vagitype add in terms of prediction, PLOTS DONE
#make summaries
#pca under both smoothed and unsmoother conditions
#prcomp on raw data, scale variables to same unit

vars <- c("vaginal_pH", "alpha_div", "race", "vagitype", "age", "education", "smoker_current")
mx_vars<- c("vaginal_pH", "alpha_div", "race", "vagitype", "age", "education", "smoker_current", "reported_ga","SUBJECT_ID")
mx_type <- c("vaginal_pH", "alpha_div", "race", "age", "education", "smoker_current", "reported_ga","SUBJECT_ID")
no_type <- c("vaginal_pH", "alpha_div", "race", "age", "education", "smoker_current")

#predict based on these models

taxalist <- c("Lactobacillus_iners", "Lactobacillus_crispatus_cluster", "Gardnerella_vaginalis", "Lachnospiraceae_BVAB1", "Lactobacillus_gasseri_cluster", "Sneathia_amnii", "Prevotella_cluster2", "TM7_OTU-H1")
tax_plots <- list()
tax_hists <- list()

#taxalist <- c("Lactobacillus_iners", "Lactobacillus_crispatus_cluster")
for (taxa in taxalist){
 mixed_full <- covs.taxa(full_set, taxa, "mixed", vars)
 mixed_notype <- covs.taxa(full_set, taxa, "mixed", no_type)
 print(taxa)
 print(anova(mixed_full, mixed_notype))
 #print(summary(mixed_notype))
 var = paste(taxa, "Full", sep = "_")
 var2 = paste(taxa, "NoType", sep = "_")
 tax_plots[[var]] <- agg_plot(mixed_full, taxa, full_set, mx_vars) + ggtitle(var)
 tax_plots[[var2]] <- agg_plot(mixed_notype, taxa, full_set, mx_type) + ggtitle(var2)
 tax_hists[[var]] <- make_hists(mixed_full, taxa, full_set, mx_vars) + ggtitle(var)
 tax_hists[[var2]] <- make_hists(mixed_notype, taxa, full_set, mx_type) + ggtitle(var2)
}
new_list = c("Lactobacillus_iners", "Lactobacillus_crispatus_cluster", "Gardnerella_vaginalis", "Lachnospiraceae_BVAB1", "Lactobacillus_gasseri_cluster")
perf <- data.frame()
predFrame <- data.frame()
see <- subsetFull(full_set, names(t_otu))
updated <- t_otu[,(names(t_otu) %in% names(see))]

#comb <- data.frame()
for (taxa in names(updated)){
  mixed_full <- covs.taxa(see, taxa, "mixed", vars)
  if (taxa %in% new_list){
    orig <- paste(taxa, "Orig", sep = "_")
    predicted <- paste(taxa, "Pred", sep = "_")
    pre_dat <- getData(mixed_full)
    pred <- predict(mixed_full)
    comb[1:length(pred),c(orig, predicted)] <- cbind(pre_dat[,c("tax_prop", "SUBJECT_ID")], pred)
    # comb <- comb %>%
    #   rename(!!orig := tax_prop, !!predicted := pred)
  }
  var = paste(taxa, "Full", sep = "_")
  predFrame[1:length(predict(mixed_full)),taxa] <- predict(mixed_full)
  perf[taxa,names(model_performance(mixed_full, metrics = "all", verbose = TRUE))] <- model_performance(mixed_full, metrics = "all", verbose = TRUE)
}
predFrame[predFrame < 0] <- 0
predFrame[predFrame > 1] <- 1
comb[comb < 0] <- 0
comb[comb > 1] <- 1
comb <- comb[ , c(2, 1, 3:ncol(comb))]
no_na <- na.omit(full_set[, vars])
pred_Full <- cbind(predFrame, no_na)
#Get the actual from the initial model
#Add in subject_id
#Truncate, DONE
perf$rankRMSE <- rank(perf$RMSE)
perf$rankR2Cond <- rank(-perf$R2_conditional)
perf$Name <- rownames(perf)
kable(perf,  table.attr = "style = \"color: black;\"") %>%
  kable_styling(bootstrap_options = c("striped", "hover"))

AIC BIC R2_conditional R2_marginal ICC RMSE rankRMSE rankR2Cond Name
Acidovorax_delafieldii_temperans -17387.410 -16959.236 0.9271641 0.2295530 0.9054628 0.0007071 8 14 Acidovorax_delafieldii_temperans
Actinobaculum_massiliense -15720.057 -15291.883 0.1877250 0.1087328 0.0886291 0.0020474 43 112 Actinobaculum_massiliense
Actinomyces_meyeri_odontolyticus -16192.741 -15764.567 0.2221724 0.2200070 0.0027761 0.0019171 40 107 Actinomyces_meyeri_odontolyticus
Actinomyces_oris -17914.653 -17486.479 NA 0.2045212 NA 0.0011628 17 134 Actinomyces_oris
Actinomyces_turicensis -17145.296 -16717.122 0.7263134 0.7259751 0.0012345 0.0014539 26 32 Actinomyces_turicensis
Actinomyces_viscosus -23121.373 -22693.199 NA 0.9991278 NA 0.0002544 3 135 Actinomyces_viscosus
Aerococcus_christensenii -11549.295 -11121.121 0.3151074 0.1628666 0.1818596 0.0064874 108 96 Aerococcus_christensenii
Alistipes_putredinis -17319.223 -16891.049 0.2937339 0.0347617 0.2682988 0.0011426 16 101 Alistipes_putredinis
Alloprevotella_tannerae -17248.765 -16820.591 0.6059447 0.6032216 0.0068629 0.0014031 24 55 Alloprevotella_tannerae
Alloscardovia_omnicolens -19051.850 -18623.676 0.9764949 0.9704455 0.2046855 0.0007158 9 9 Alloscardovia_omnicolens
Anaerococcus_hydrogenalis -14922.757 -14494.582 0.3844899 0.3349658 0.0744685 0.0026119 55 83 Anaerococcus_hydrogenalis
Anaerococcus_lactolyticus_cluster -12227.660 -11799.486 0.2947407 0.1989324 0.1196008 0.0055480 98 100 Anaerococcus_lactolyticus_cluster
Anaerococcus_OTU147 -13373.748 -12945.574 0.6903183 0.6016931 0.2225047 0.0037140 77 41 Anaerococcus_OTU147
Anaerococcus_OTU153 -13596.832 -13168.658 0.0251648 0.0222535 0.0029775 0.0040891 85 132 Anaerococcus_OTU153
Anaerococcus_OTU5 -14646.079 -14217.905 0.4881823 0.4521661 0.0657429 0.0028510 57 73 Anaerococcus_OTU5
Anaerococcus_tetradius -14025.857 -13597.683 0.2405319 0.1324379 0.1245951 0.0032709 68 106 Anaerococcus_tetradius
Anaerococcus_vaginalis -14494.086 -14065.912 0.2064371 0.2048119 0.0020438 0.0031497 66 109 Anaerococcus_vaginalis
Anaeroglobus_geminatus -21037.536 -20609.362 0.9351840 0.9272559 0.1089851 0.0004272 6 13 Anaeroglobus_geminatus
Arthrobacter_albus_cumminsii -14539.968 -14111.794 0.2168729 0.1840776 0.0401941 0.0030028 61 108 Arthrobacter_albus_cumminsii
Atopobium_rimae -15363.122 -14934.948 NA 0.0219892 NA 0.0024489 53 136 Atopobium_rimae
Atopobium_vaginae -5190.115 -4761.940 0.7177193 0.6678172 0.1502248 0.0423779 143 38 Atopobium_vaginae
Bacteroides_cellulosilyticus_intestinalis_oleiciplenus -19367.897 -18939.723 0.9865959 0.9864790 0.0086461 0.0007546 10 7 Bacteroides_cellulosilyticus_intestinalis_oleiciplenus
Bacteroides_coagulans -13653.838 -13225.664 0.4043407 0.3414861 0.0954491 0.0037232 78 79 Bacteroides_coagulans
Bacteroides_fragilis -17094.283 -16666.109 0.9927665 0.9448892 0.8687463 0.0008135 11 4 Bacteroides_fragilis
Bacteroides_massiliensis -15873.587 -15445.413 0.3121851 0.3077049 0.0064716 0.0020969 46 97 Bacteroides_massiliensis
Bacteroides_stercoris -14477.036 -14048.861 0.0297209 0.0267779 0.0030240 0.0031625 67 130 Bacteroides_stercoris
Bacteroides_uniformis -16756.746 -16328.572 0.0933740 0.0891582 0.0046285 0.0016232 32 122 Bacteroides_uniformis
Bacteroides_vulgatus_dorei -12015.922 -11587.748 0.7287686 0.7255546 0.0117110 0.0064336 107 30 Bacteroides_vulgatus_dorei
Bergeyella_sp_oral_taxon_931 -12982.393 -12554.219 0.3028418 0.0359770 0.2768241 0.0040319 84 99 Bergeyella_sp_oral_taxon_931
Bifidobacterium_bifidum -13471.207 -13043.033 0.1020659 0.0750877 0.0291684 0.0041413 86 121 Bifidobacterium_bifidum
Bifidobacterium_breve_cluster -11078.986 -10650.811 0.7760194 0.7646163 0.0484448 0.0081894 116 27 Bifidobacterium_breve_cluster
Bifidobacterium_dentium -14408.210 -13980.035 0.3088783 0.0270391 0.2896716 0.0026397 56 98 Bifidobacterium_dentium
Bifidobacterium_longum_infantis_suis -11635.179 -11207.005 0.9627866 0.9591566 0.0888780 0.0067442 112 10 Bifidobacterium_longum_infantis_suis
Bosea_cluster53 -15841.045 -15412.871 0.0337541 0.0094344 0.0245513 0.0020821 44 129 Bosea_cluster53
Bradyrhizobiaceae_cluster49 -15389.722 -14961.548 0.9070597 0.9044323 0.0274927 0.0023691 51 15 Bradyrhizobiaceae_cluster49
Brevibacterium_ravenspurgense -16031.641 -15603.467 0.2047684 0.0859569 0.1299846 0.0018146 38 110 Brevibacterium_ravenspurgense
Campylobacter_hominis -15670.135 -15241.961 0.7217163 0.7027838 0.0636992 0.0021178 47 36 Campylobacter_hominis
Campylobacter_ureolyticus -10874.731 -10446.557 0.3750741 0.3215450 0.0788985 0.0084842 117 84 Campylobacter_ureolyticus
Caulobacteraceae_cluster34 -18483.008 -18054.834 0.2406715 0.2391469 0.0020038 0.0009832 13 105 Caulobacteraceae_cluster34
Clostridiaceae_1_OTU17 -14981.602 -14553.428 0.4267996 0.0927837 0.3681767 0.0021368 48 77 Clostridiaceae_1_OTU17
Clostridiales_BVAB2 -10144.740 -9716.565 0.3568899 0.1961364 0.1999760 0.0096642 119 87 Clostridiales_BVAB2
Clostridiales_BVAB3 -15805.018 -15376.844 0.5740039 0.5219274 0.1089300 0.0019677 42 62 Clostridiales_BVAB3
Clostridiales_OTU22 -11204.319 -10776.145 0.4332410 0.3862199 0.0766091 0.0077196 115 76 Clostridiales_OTU22
Clostridium_colicanis -16601.161 -16172.987 0.5398366 0.1221161 0.4758266 0.0012553 20 63 Clostridium_colicanis
Coriobacteriaceae_OTU27 -11870.674 -11442.500 0.3226097 0.2985736 0.0342674 0.0065781 110 92 Coriobacteriaceae_OTU27
Corynebacterium_aurimucosum_nigricans -12280.496 -11852.322 0.6031171 0.5820755 0.0503479 0.0057579 101 56 Corynebacterium_aurimucosum_nigricans
Corynebacterium_cluster45 -9984.212 -9556.037 0.6365505 0.6072596 0.0745807 0.0110394 123 49 Corynebacterium_cluster45
Corynebacterium_cluster58 -10123.409 -9695.234 0.6697698 0.6390697 0.0850582 0.0105152 122 44 Corynebacterium_cluster58
Corynebacterium_coyleae -13420.777 -12992.603 0.2844032 0.1643998 0.1436135 0.0038522 81 102 Corynebacterium_coyleae
Corynebacterium_imitans_lipophiloflavum -12886.473 -12458.299 0.1650645 0.1366723 0.0328869 0.0048961 92 116 Corynebacterium_imitans_lipophiloflavum
Corynebacterium_pyruviciproducens_cluster -13655.134 -13226.960 0.5125039 0.4412694 0.1274934 0.0036374 76 68 Corynebacterium_pyruviciproducens_cluster
Corynebacterium_simulans_striatum -16261.320 -15833.146 0.1337907 0.0947030 0.0431767 0.0018123 37 119 Corynebacterium_simulans_striatum
Corynebacterium_thomssenii_sundsvallense -11479.381 -11051.207 0.5206713 0.3074384 0.3078901 0.0061423 105 66 Corynebacterium_thomssenii_sundsvallense
Corynebacterium_urealyticum -17645.668 -17217.494 0.0854713 0.0692736 0.0174033 0.0012375 19 124 Corynebacterium_urealyticum
Delftia_acidovorans_lacustris_tsuruhatensis -39739.438 -39311.264 1.0000000 1.0000000 0.6269149 0.0000013 1 1 Delftia_acidovorans_lacustris_tsuruhatensis
Dialister_cluster51 -10347.769 -9919.595 0.5386496 0.4972450 0.0823553 0.0098688 121 64 Dialister_cluster51
Dialister_invisus -18254.521 -17826.347 0.8110256 0.7648935 0.1962178 0.0009081 12 22 Dialister_invisus
Dialister_micraerophilus -11942.287 -11514.113 0.3590883 0.2684235 0.1239308 0.0060117 104 85 Dialister_micraerophilus
Dialister_propionicifaciens -12741.237 -12313.063 0.4948088 0.3573199 0.2139305 0.0044904 91 71 Dialister_propionicifaciens
Dorea_longicatena -14479.415 -14051.241 0.1611618 0.0676141 0.1003315 0.0029154 59 117 Dorea_longicatena
Enterobacteriaceae_cluster31 -8430.659 -8002.485 0.6906619 0.6890979 0.0050307 0.0184357 135 40 Enterobacteriaceae_cluster31
Enterococcus_faecalis -15801.895 -15373.721 0.9478028 0.9461683 0.0303633 0.0020952 45 11 Enterococcus_faecalis
Ezakiella_peruensis -13682.122 -13253.948 0.6743047 0.4525639 0.4050533 0.0030598 62 42 Ezakiella_peruensis
Faecalibacterium_prausnitzii -19839.623 -19411.449 0.9836472 0.9834699 0.0107273 0.0006563 7 8 Faecalibacterium_prausnitzii
Fenollaria_timonensis -15767.716 -15339.542 0.6089678 0.4477681 0.2919058 0.0017728 34 54 Fenollaria_timonensis
Finegoldia_magna -9461.447 -9033.272 0.4884036 0.4479662 0.0732517 0.0128724 130 72 Finegoldia_magna
Fusobacterium_cluster48 -11778.202 -11350.028 0.5181079 0.4763256 0.0797868 0.0065131 109 67 Fusobacterium_cluster48
Fusobacterium_gonidiaformans_equinum -13511.169 -13082.995 0.6135764 0.6083021 0.0134652 0.0041515 87 52 Fusobacterium_gonidiaformans_equinum
Gardnerella_vaginalis -3526.012 -3097.837 0.7247044 0.5923336 0.3247037 0.0620045 145 33 Gardnerella_vaginalis
Gemella_morbillorum_sanguinis_haemolysans -11713.301 -11285.127 0.3567435 0.3519466 0.0074020 0.0070561 113 88 Gemella_morbillorum_sanguinis_haemolysans
Gemella_OTU86 -12681.749 -12253.575 NA 0.6045299 NA 0.0053563 97 137 Gemella_OTU86
Haemophilus_cluster60 -12617.651 -12189.477 0.1875526 0.1609860 0.0316640 0.0053014 95 113 Haemophilus_cluster60
Helcococcus_seattlensis -17137.023 -16708.848 NA 0.6058945 NA 0.0014591 27 138 Helcococcus_seattlensis
Lachnospiraceae_BVAB1 -3634.694 -3206.520 0.8921398 0.8758694 0.1310750 0.0676006 146 18 Lachnospiraceae_BVAB1
Lachnospiraceae_OTU33 -16458.715 -16030.541 0.3214496 0.1890243 0.1632913 0.0015665 28 93 Lachnospiraceae_OTU33
Lactobacillus_crispatus_cluster -3036.883 -2608.709 0.9435135 0.9278223 0.2173966 0.0761257 147 12 Lactobacillus_crispatus_cluster
Lactobacillus_delbrueckii -12241.633 -11813.459 0.9905929 0.9662504 0.7212673 0.0038535 82 5 Lactobacillus_delbrueckii
Lactobacillus_gasseri_cluster -4971.098 -4542.924 0.8658581 0.7717616 0.4122726 0.0387450 142 20 Lactobacillus_gasseri_cluster
Lactobacillus_iners -1813.793 -1385.619 0.8925608 0.8564098 0.2517650 0.1065701 148 17 Lactobacillus_iners
Lactobacillus_jensenii -4259.587 -3831.413 0.7961473 0.6668447 0.3881152 0.0483247 144 24 Lactobacillus_jensenii
Megasphaera_OTU70_type1 -7467.563 -7039.389 0.3477399 0.2288477 0.1541748 0.0217420 137 89 Megasphaera_OTU70_type1
Megasphaera_OTU71_type2 -11234.891 -10806.717 0.5791815 0.5164580 0.1297169 0.0073608 114 60 Megasphaera_OTU71_type2
Metaprevotella_massiliensis -13811.086 -13382.912 0.3464414 0.3280576 0.0273591 0.0037562 79 90 Metaprevotella_massiliensis
Moraxella_osloensis -13329.425 -12901.251 0.0088088 0.0032703 0.0055567 0.0044102 90 133 Moraxella_osloensis
Morganella_morganii -18406.099 -17977.925 0.9930131 0.9930050 0.0011503 0.0010063 14 3 Morganella_morganii
Moryella_indoligenes -13804.123 -13375.949 0.3904462 0.3883232 0.0034708 0.0038472 80 82 Moryella_indoligenes
Mycoplasma_girerdii -8433.902 -8005.727 NA 0.8597742 NA 0.0185073 136 139 Mycoplasma_girerdii
Mycoplasma_hominis -8821.780 -8393.606 0.7451051 0.7057186 0.1338397 0.0148453 131 29 Mycoplasma_hominis
Negativicoccus_succinicivorans -14828.361 -14400.187 0.1354207 0.1343746 0.0012086 0.0028592 58 118 Negativicoccus_succinicivorans
Neisseria_flava/mucosa/pharyngis/sicca -16846.954 -16418.780 0.7819320 0.7803593 0.0071605 0.0015772 29 26 Neisseria_flava/mucosa/pharyngis/sicca
Neisseria_flavescens -15206.405 -14778.231 0.0809069 0.0774925 0.0037012 0.0025544 54 126 Neisseria_flavescens
Oligella_urethralis -13361.349 -12933.175 0.7244324 0.7211371 0.0118168 0.0043437 89 34 Oligella_urethralis
Parasutterella_excrementihominis -21400.997 -20972.823 0.9883619 0.9880825 0.0234445 0.0004113 5 6 Parasutterella_excrementihominis
Parvimonas_OTU142 -14041.327 -13613.153 0.3307974 0.2446606 0.1140372 0.0032805 69 91 Parvimonas_OTU142
Peptoniphilus_coxii -17929.505 -17501.330 0.3169154 0.2017410 0.1442820 0.0010327 15 94 Peptoniphilus_coxii
Peptoniphilus_duerdenii -14219.740 -13791.566 0.6622962 0.5918521 0.1725946 0.0029932 60 45 Peptoniphilus_duerdenii
Peptoniphilus_harei -14431.342 -14003.168 0.4231846 0.3902277 0.0540479 0.0030640 63 78 Peptoniphilus_harei
Peptoniphilus_indolicus -13371.390 -12943.216 0.5904156 0.5097682 0.1645088 0.0038544 83 57 Peptoniphilus_indolicus
Peptoniphilus_urinimassiliensis -16674.119 -16245.944 NA 0.6422790 NA 0.0016702 33 140 Peptoniphilus_urinimassiliensis
Peptostreptococcus_anaerobius -15527.769 -15099.594 0.7970001 0.7961226 0.0043040 0.0023243 50 23 Peptostreptococcus_anaerobius
Porphyromonadaceae_OTU134 -13993.871 -13565.697 0.6371127 0.6003435 0.0920020 0.0033800 70 48 Porphyromonadaceae_OTU134
Porphyromonas_bennonis -12350.176 -11922.002 0.4955297 0.4817128 0.0266589 0.0057571 100 70 Porphyromonas_bennonis
Porphyromonas_endodontalis -17026.343 -16598.169 0.7710487 0.7568240 0.0584954 0.0014314 25 28 Porphyromonas_endodontalis
Porphyromonas_somerae -12378.075 -11949.901 0.6273122 0.6170711 0.0267441 0.0057100 99 51 Porphyromonas_somerae
Porphyromonas_uenonis_asaccharolytica -16090.012 -15661.838 0.8251491 0.1553773 0.7929834 0.0011826 18 21 Porphyromonas_uenonis_asaccharolytica
Prevotella_amnii -8677.040 -8248.866 0.4499903 0.4229423 0.0468722 0.0165311 132 74 Prevotella_amnii
Prevotella_bergensis -16186.624 -15758.450 NA 0.1950469 NA 0.0019256 41 141 Prevotella_bergensis
Prevotella_bivia -6950.805 -6522.630 0.6510750 0.6285707 0.0605885 0.0270558 139 47 Prevotella_bivia
Prevotella_cluster2 -5822.643 -5394.469 0.7178322 0.6851120 0.1039107 0.0363862 141 37 Prevotella_cluster2
Prevotella_cluster50 -10235.123 -9806.949 0.4344745 0.2877211 0.2060336 0.0093773 118 75 Prevotella_cluster50
Prevotella_copri -17403.038 -16974.864 NA 0.8906897 NA 0.0013501 23 142 Prevotella_copri
Prevotella_corporis -11738.854 -11310.680 0.6731350 0.6273574 0.1228460 0.0063843 106 43 Prevotella_corporis
Prevotella_denticola -12693.957 -12265.782 0.1223159 0.0517441 0.0744227 0.0050061 93 120 Prevotella_denticola
Prevotella_disiens -9677.822 -9249.648 0.5740524 0.5239280 0.1052875 0.0117979 129 61 Prevotella_disiens
Prevotella_melaninogenica_cluster -10027.073 -9598.899 0.5069715 0.4842113 0.0441269 0.0111728 126 69 Prevotella_melaninogenica_cluster
Prevotella_multiformis -16367.291 -15939.117 0.0258672 0.0167734 0.0092490 0.0018107 36 131 Prevotella_multiformis
Prevotella_oris -13855.232 -13427.058 0.5802765 0.5437766 0.0800044 0.0035516 72 59 Prevotella_oris
Prevotella_OTU-T3 -12322.620 -11894.446 0.1655112 0.1411768 0.0283346 0.0057950 103 115 Prevotella_OTU-T3
Prevotella_OTU44 -13504.712 -13076.538 0.3165165 0.1354243 0.2094579 0.0036033 75 95 Prevotella_OTU44
Prevotella_OTU49 -15416.529 -14988.355 0.0825019 0.0675626 0.0160218 0.0023749 52 125 Prevotella_OTU49
Prevotella_OTU54 -17478.489 -17050.315 0.0921929 0.0830236 0.0099996 0.0013082 21 123 Prevotella_OTU54
Prevotella_pleuritidis -16390.591 -15962.417 0.1967202 0.1876299 0.0111898 0.0017952 35 111 Prevotella_pleuritidis
Prevotellaceae_OTU61 -13630.966 -13202.792 0.2570843 0.1103191 0.1649638 0.0035721 74 104 Prevotellaceae_OTU61
Propionibacterium_acnes -12343.547 -11915.373 0.7890623 0.7835592 0.0254255 0.0057745 102 25 Propionibacterium_acnes
Propionibacterium_avidum_propionicum -22634.687 -22206.513 0.5900243 0.5700952 0.0463571 0.0002813 4 58 Propionibacterium_avidum_propionicum
Propionimicrobium_lymphophilum -15415.931 -14987.757 0.3580599 0.2975746 0.0861092 0.0022417 49 86 Propionimicrobium_lymphophilum
Proteobacteria_OTU-T1 -11803.483 -11375.309 0.3916603 0.0059571 0.3880146 0.0053444 96 81 Proteobacteria_OTU-T1
Pseudomonas_aeruginosa -28946.442 -28518.268 0.9994494 0.9994403 0.0161400 0.0000458 2 2 Pseudomonas_aeruginosa
Pseudomonas_gessardii_cluster -10051.331 -9623.157 NA 0.7282996 NA 0.0115428 128 143 Pseudomonas_gessardii_cluster
Pyramidobacter_sp -17463.554 -17035.380 NA 0.1746499 NA 0.0013265 22 144 Pyramidobacter_sp
Rothia_mucilaginosa -14543.629 -14115.455 0.3920836 0.3891579 0.0047896 0.0030964 64 80 Rothia_mucilaginosa
Shuttleworthia_satelles -16847.007 -16418.833 NA 0.0235293 NA 0.0015880 30 145 Shuttleworthia_satelles
Sneathia_amnii -6506.739 -6078.565 0.6302520 0.5494910 0.1792661 0.0283143 140 50 Sneathia_amnii
Sneathia_OTU65 -15287.130 -14858.956 0.7278586 0.0364789 0.7175554 0.0015885 31 31 Sneathia_OTU65
Sneathia_sanguinegens -9242.268 -8814.094 0.6103573 0.3267148 0.4212813 0.0110825 124 53 Sneathia_sanguinegens
Staphylococcus_cluster47 -9972.464 -9544.290 0.9052482 0.9025158 0.0280285 0.0115103 127 16 Staphylococcus_cluster47
Stenotrophomonas_rhizophila -14132.455 -13704.281 0.2580453 0.2554337 0.0035075 0.0034955 71 103 Stenotrophomonas_rhizophila
Streptococcus_agalactiae -9996.456 -9568.282 0.8911707 0.8841266 0.0607908 0.0111201 125 19 Streptococcus_agalactiae
Streptococcus_anginosus -10201.064 -9772.889 0.7218227 0.6671497 0.1642571 0.0097254 120 35 Streptococcus_anginosus
Streptococcus_cluster29 -7157.037 -6728.863 NA 0.7616446 NA 0.0268661 138 146 Streptococcus_cluster29
Streptococcus_parasanguinis -14245.346 -13817.172 0.1813025 0.0935078 0.0968511 0.0031295 65 114 Streptococcus_parasanguinis
Streptococcus_salivarius_thermophilus_vestibularis -12741.409 -12313.235 NA 0.2848975 NA 0.0052639 94 147 Streptococcus_salivarius_thermophilus_vestibularis
Sutterella_sanguinis -16261.200 -15833.026 0.0779703 0.0603826 0.0187179 0.0018514 39 127 Sutterella_sanguinis
TM7_OTU-H1 -8029.654 -7601.480 0.5272706 0.4295954 0.1712384 0.0182472 134 65 TM7_OTU-H1
Ureaplasma_cluster23 -8382.795 -7954.621 0.7026208 0.6466340 0.1584385 0.0165980 133 39 Ureaplasma_cluster23
Veillonella_cluster61 -11907.828 -11479.654 NA 0.2620593 NA 0.0067139 111 148 Veillonella_cluster61
Veillonella_montpellierensis -14048.875 -13620.701 0.0577799 0.0511873 0.0069483 0.0035701 73 128 Veillonella_montpellierensis
Weeksella_virosa -13505.758 -13077.583 0.6554187 0.6517386 0.0105671 0.0041692 88 46 Weeksella_virosa

#make boxplot for the metrics and look at outliers, use rmse
#see rank of metrics for specific taxa of interest
#add column for quantiles for the specific taxa of interest
rmse <- ggplot(perf, aes(x="RMSE", y=RMSE)) + 
  geom_boxplot() +
  geom_point(data=subset(perf, rownames(perf) %in% new_list), 
    aes(x="RMSE", y=RMSE), 
    color="red", size=5) +
  geom_text_repel(data=subset(perf, rownames(perf) %in% new_list), aes(label=Name))
rmse

r2_cond <- ggplot(perf, aes(x="R2_Cond", y=R2_conditional)) + 
  geom_boxplot() +
  geom_point(data=subset(perf, rownames(perf) %in% new_list), 
    aes(x="R2_Cond", y=R2_conditional), 
    color="red", size=5) +
  geom_text_repel(data=subset(perf, rownames(perf) %in% new_list), aes(label=Name))
r2_cond

marrangeGrob(tax_plots, nrow = 2, ncol = 2)

marrangeGrob(tax_hists, nrow = 2, ncol = 2)
`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

iners_f_hist <- make_hists(iners_fixed, "Lactobacillus_iners", full_set, vars) + ggtitle("Iners Fixed Plot")
iners_m_hist <- make_hists(iners_mixed, "Lactobacillus_iners", full_set, mx_vars) + ggtitle("Iners Mixed Plot")
ggarrange(iners_f_hist, iners_m_hist)
crisp_f_hist <- make_hists(crisp_fixed, "Lactobacillus_crispatus_cluster", full_set, vars) + ggtitle("Crisp Fixed Plot")
crisp_m_hist <- make_hists(crisp_mixed, "Lactobacillus_crispatus_cluster", full_set, mx_vars) + ggtitle("Crisp Mixed Plot")
ggarrange(crisp_f_hist, crisp_m_hist)

#clean age, set blanks to NA, clean values, make 18-28 reference group
# do individual plots
# quantify prediction success
# get model summaries
# perform on more taxa
#education, high_school_ged is baseline, put in order of ascending degree
#predictions <- pred_plot(full_set, mod_Iners)
#lapply(models, model_plot)

mod_Crisp <- taxa.glm(full_set, "Lactobacillus_crispatus_cluster", "class", "fixed")
mod_Iners <- taxa.glm(full_set, "Lactobacillus_iners", "class", "fixed")
mod_BVAB <- taxa.glm(full_set, "Lachnospiraceae_BVAB1", "class", "fixed")
mod_Gard <- taxa.glm(full_set, "Gardnerella_vaginalis", "class", "fixed")
mod_Gass <- taxa.glm(full_set, "Lactobacillus_gasseri_cluster", "class", "fixed")

prop_Crisp <- taxa.glm(full_set, "Lactobacillus_crispatus_cluster", "prop", "fixed")
prop_Iners <- taxa.glm(full_set, "Lactobacillus_iners", "prop", "fixed")
prop_BVAB <- taxa.glm(full_set, "Lachnospiraceae_BVAB1", "prop", "fixed")
prop_Gard <- taxa.glm(full_set, "Gardnerella_vaginalis", "prop", "fixed")
prop_Gass <- taxa.glm(full_set, "Lactobacillus_gasseri_cluster", "prop", "fixed")

ggarrange(model_plot(mod_Crisp), model_plot(prop_Crisp),labels = c("Class Crisp", "Prop Crisp"))
ggarrange(model_plot(mod_Iners), model_plot(prop_Iners),labels = c("Class Iners", "Prop Iners"))
ggarrange(model_plot(mod_BVAB), model_plot(prop_BVAB),labels = c("Class BVAB", "Prop BVAB"))
ggarrange(model_plot(mod_Crisp), model_plot(prop_Iners),labels = c("Class Gard", "Prop Gard"))
ggarrange(model_plot(mod_Gass), model_plot(prop_Gass),labels = c("Class Gass", "Prop Gass"))
model_plot(mod_Crisp) + ggtitle("Class Crispatus")
model_plot(mod_Iners) + ggtitle("Class Iners")
model_plot(mod_BVAB) + ggtitle("Class BVAB")
model_plot(mod_Gard) + ggtitle("Class Gardnerella")
model_plot(mod_Gass) + ggtitle("Class Gasseri")

#alpha_div and vagitype show up in almost every model vaginal_pH in most of them
#correlations between data
#longitudinal plots when stratifying by covariates
#count and proportion plots
#Look at model pre and post aic
#ggaly and regular correlation matrix
#Look at distribution of NA's in covariate (pre-filtering) and see if it affects model by removing it
mixed_Iners <- taxa.glm(full_set, "Lactobacillus_crispatus_cluster", "mean", "mixed")
#Some linear combination of the parameters perfectly separates failure from successes, I assume it's vagitype
#Doesn't run still
na.test <-  function (x) {
  w <- sapply(x, function(x)all(is.na(x)))
  if (any(w)) {
    stop(paste("All NA in columns", paste(which(w), collapse=", ")))
  }
}
#Recode High to 1 and Low to 0
full_set[["class_Iners"]] <- recode(full_set[["class_Iners"]], "High"= 1, "Low"= 0)
full_set$class_Iners <- as.factor(full_set$class_Iners)


covs <- c("class_Iners", "vaginal_pH", "reported_ga", "alpha_div", "race", "vagitype", "income", "bv", "phys", "antibiotics_current","SUBJECT_ID")
model_data <- full_set[,covs]
no.na.data <- na.omit(model_data)

reduced_mod <- glmer(class_Iners ~ alpha_div + vagitype + (1|SUBJECT_ID), data = no.na.data, family = binomial, control=glmerControl(optimizer="bobyqa",optCtrl=list(maxfun=2e5))) #Unable to evaluate scaled gradient, model fails to converge
step.model <- stepAIC(log_mod, direction = "both", 
                      trace = FALSE)
#install.packages("ResourceSelection")
library(ResourceSelection)

fixed_mod <- glm(class_Iners ~ vaginal_pH + reported_ga + alpha_div + race + vagitype + income + phys + bv + antibiotics_current, data = no.na.data, family = binomial) 
step.model <- stepAIC(fixed_mod, direction = "both", 
                      trace = FALSE)
step.back <- stepAIC(fixed_mod, direction = "backward")
hoslem.test(no.na.data$class_Iners, fitted(fixed_mod))
hoslem.test(no.na.data$class_Iners, fitted(step.model))
# model becomes class_Iners ~ alpha_div + vagitype
predict(fixed_mod, type = "response")
histogram(predict(fixed_mod, type = "response"))
plot(predict(fixed_mod, type = "response"), full_set$class_Iners)



plotting_dfm <- expand.grid(alpha_div = seq(0, 3.5, 0.01),
                           vagitype = c("Lactobacillus_crispatus_cluster", "Lactobacillus_iners", "Lachnospiraceae_BVAB1", "Gardnerella_vaginalis", "Lactobacillus_gasseri_cluster"))
plotting_dfm$preds <- predict(step.model, newdata=plotting_dfm, type = "response")
pl <- ggplot(plotting_dfm, aes(x=alpha_div, y =preds, color=as.factor(vagitype)))
pl + 
  geom_point( ) +
  ggtitle("Predicted High/Low by Alpha_Div and Vagitype")

#Look at alpha div and vagitype over trimesters and see how it changes 
# plot those who change
#Table of amt that change vs not change
reduced_mod <- glmer(class_Crisp ~ vaginal_pH + reported_ga + alpha_div + race + vagitype + income + (1|SUBJECT_ID), data = full_set, family = binomial) #Does not converge
reduced_mod <- glmer(class_Gass ~ vaginal_pH + reported_ga + alpha_div + race + vagitype + income + (1|SUBJECT_ID), data = full_set, family = binomial) #Does not converge
reduced_mod <- glmer(class_Gard ~ vaginal_pH + reported_ga + alpha_div + race + vagitype + income + (1|SUBJECT_ID), data = full_set, family = binomial) #Does not converge
reduced_mod <- glmer(class_BVAB ~ vaginal_pH + reported_ga + alpha_div + race + vagitype + income + (1|SUBJECT_ID), data = full_set, family = binomial) #Does not converge
library(caret)

#Split Data 80:20
training.samps <- full_set$class_Iners %>%
  createDataPartition(p = 0.8, list = FALSE)
train.data <- full_set[training.samps,]
test.data <- full_set[-training.samps,]
#Linear discriminant analysis
#install.packages("mda")
library(mda)
covs_test <- as.data.frame(test.data[,cov])
full_set$vagitype <- as.factor(full_set$vagitype)
full_set$SUBJECT_ID <- as.factor(full_set$SUBJECT_ID)
mda_model <- mda(class_Iners ~ vaginal_pH + reported_ga + alpha_div + race + vagitype + income + SUBJECT_ID, data = full_set)
#summary(mda_model)
mda_model$percent.explained
mda_model$confusion
covs <- covs[which(names(pred) %in% rownames(full_set)),]
#mda_pred <- predict(mda_model,test.data)


# SVM
library(e1071)
rownames(full_set) <- rownames(meta)
cov <- c("vaginal_pH", "reported_ga", "alpha_div", "race", "vagitype", "income", "SUBJECT_ID")
covs <- as.data.frame(full_set[,cov])
svm_model <- svm(class_Iners ~ vaginal_pH + reported_ga + alpha_div + race + vagitype + income + SUBJECT_ID, data = full_set, type = "C")
pred <- predict(svm_model, covs)
table(pred, full_set$class_Iners[which(names(pred) %in% rownames(full_set))])
#Random Forest
#try randomforest with formula and see if random var works
#install.packages("randomForest")
#library(randomForest)
#full_set$vagitype <- as.character(full_set$vagitype)
#full_set$SUBJECT_ID <- as.character(full_set$SUBJECT_ID)
#forest_model <- randomForest(class_Iners ~ vaginal_pH + reported_ga + alpha_div + race + vagitype + SUBJECT_ID, data = full_set, na.action=na.omit, importance = TRUE)

#Split Data 80:20
training.samps <- full_set$class_Snea %>%
  createDataPartition(p = 0.8, list = FALSE)
train.data <- full_set[training.samps,]
test.data <- full_set[-training.samps,]
#Linear discriminant analysis
#install.packages("mda")
library(mda)
covs_test <- as.data.frame(test.data[,cov])
full_set$vagitype <- as.factor(full_set$vagitype)
full_set$SUBJECT_ID <- as.factor(full_set$SUBJECT_ID)
mda_model <- mda(class_Snea ~ vaginal_pH + reported_ga + alpha_div + race + vagitype + SUBJECT_ID, data = train.data)
#summary(mda_model)
mda_model$percent.explained
mda_model$confusion
covs <- covs[which(names(pred) %in% rownames(full_set)),]
#mda_pred <- predict(mda_model,test.data)


# SVM
library(e1071)
rownames(full_set) <- rownames(meta)
cov <- c("vaginal_pH", "reported_ga", "alpha_div", "race", "vagitype", "SUBJECT_ID")
covs <- as.data.frame(full_set[,cov])
svm_model <- svm(class_Snea ~ vaginal_pH + reported_ga + alpha_div + race + vagitype + SUBJECT_ID, data = full_set, type = "C")
pred <- predict(svm_model, covs)
table(pred, full_set$class_Snea[which(names(pred) %in% rownames(full_set))])

#Split Data 80:20
training.samps <- full_set$class_Prev %>%
  createDataPartition(p = 0.8, list = FALSE)
train.data <- full_set[training.samps,]
test.data <- full_set[-training.samps,]
#Linear discriminant analysis
#install.packages("mda")
library(mda)
covs_test <- as.data.frame(test.data[,cov])
full_set$vagitype <- as.factor(full_set$vagitype)
full_set$SUBJECT_ID <- as.factor(full_set$SUBJECT_ID)
mda_model <- mda(class_Prev ~ vaginal_pH + reported_ga + alpha_div + race + vagitype + SUBJECT_ID, data = train.data)
#summary(mda_model)
mda_model$percent.explained
covs <- covs[which(names(pred) %in% rownames(full_set)),]
mda_model$confusion

#mda_pred <- predict(mda_model,test.data)


# SVM
library(e1071)
rownames(full_set) <- rownames(meta)
cov <- c("vaginal_pH", "reported_ga", "alpha_div", "race", "vagitype", "SUBJECT_ID")
covs <- as.data.frame(full_set[,cov])
svm_model <- svm(class_Prev ~ vaginal_pH + reported_ga + alpha_div + race + vagitype + SUBJECT_ID, data = full_set, type = "C")
pred <- predict(svm_model, covs)
table(pred, full_set$class_Prev[which(names(pred) %in% rownames(full_set))])

#Split Data 80:20
training.samps <- full_set$class_TM %>%
  createDataPartition(p = 0.8, list = FALSE)
train.data <- full_set[training.samps,]
test.data <- full_set[-training.samps,]
#Linear discriminant analysis
#install.packages("mda")
library(mda)
covs_test <- as.data.frame(test.data[,cov])
full_set$vagitype <- as.factor(full_set$vagitype)
full_set$SUBJECT_ID <- as.factor(full_set$SUBJECT_ID)
mda_model <- mda(class_TM ~ vaginal_pH + reported_ga + alpha_div + race + vagitype + SUBJECT_ID, data = train.data)
#summary(mda_model)
mda_model$percent.explained
covs <- covs[which(names(pred) %in% rownames(full_set)),]
mda_model$confusion
#mda_pred <- predict(mda_model,test.data)


# SVM
library(e1071)
rownames(full_set) <- rownames(meta)
cov <- c("vaginal_pH", "reported_ga", "alpha_div", "race", "vagitype", "SUBJECT_ID")
covs <- as.data.frame(full_set[,cov])
svm_model <- svm(class_TM ~ vaginal_pH + reported_ga + alpha_div + race + vagitype + SUBJECT_ID, data = full_set, type = "C")
pred <- predict(svm_model, covs)
table(pred, full_set$class_TM[which(names(pred) %in% rownames(full_set))])

#Split Data 80:20
training.samps <- full_set$class_BVAB %>%
  createDataPartition(p = 0.8, list = FALSE)
train.data <- full_set[training.samps,]
test.data <- full_set[-training.samps,]
#Linear discriminant analysis
#install.packages("mda")
library(mda)
covs_test <- as.data.frame(test.data[,cov])
full_set$vagitype <- as.factor(full_set$vagitype)
full_set$SUBJECT_ID <- as.factor(full_set$SUBJECT_ID)
mda_model <- mda(class_BVAB ~ vaginal_pH + reported_ga + alpha_div + race + vagitype + SUBJECT_ID, data = train.data)
#summary(mda_model)
mda_model$percent.explained
covs <- covs[which(names(pred) %in% rownames(full_set)),]
mda_model$confusion
#mda_pred <- predict(mda_model,test.data)


# SVM
library(e1071)
rownames(full_set) <- rownames(meta)
cov <- c("vaginal_pH", "reported_ga", "alpha_div", "race", "vagitype", "SUBJECT_ID")
covs <- as.data.frame(full_set[,cov])
svm_model <- svm(class_BVAB ~ vaginal_pH + reported_ga + alpha_div + race + vagitype + SUBJECT_ID, data = full_set, type = "C")
pred <- predict(svm_model, covs)
table(pred, full_set$class_BVAB[which(names(pred) %in% rownames(full_set))])
---
title: "Modeling"
output: html_notebook
---
```{r}
rm(list=ls())
require(ggplot2)
#install.packages("pscl")
#install.packages(glmmTMB)
#install.packages("DescTools")
require(DescTools)
require(glmmTMB)
require(pscl)
require(boot)
library(data.table)
library(tidyr)
library(dplyr)
library(stringr)
library(here)
library(phyloseq)
library(vegan)
library(refund)
library(reshape2)
library(kableExtra)
library(lme4)
library(MASS)
library(forcats)
library(GGally)
library(lmerTest)
library(gridExtra)
library(ggfortify)
library(performance)
library(plotly)
library(ggrepel)
library(factoextra)
#remotes::install_github("palday/coefplot2",
#           subdir = "pkg")
library(coefplot2)
library(ggpubr)

#library(HMP2Data) #install the library from github
```


```{r}
#path to protected dbGap data location

local_path <- "~/Documents/MOMS-PI"# change me
#local_path <- "~/MOMS-PI"# change me

path2 <- file.path(local_path,"71694", "PhenoGenotypeFiles", "RootStudyConsentSet_phs001523.MOMS_PI.v1.p1.c1.DS-PREG-COM-IRB-PUB-MDS", "PhenotypeFiles") 

otu <- read.csv(file = file.path(path2, "MergedFiles", "OTU_table.csv"))
rownames(otu) <- otu$X
otu$X <- NULL 
t_otu <- as.data.frame(t(otu))
meta <- read.csv(file = file.path(path2, "MergedFiles", "Meta_data.csv"))
#physical activity variable
phys <- meta[,98:100]
meta$phys <- ifelse(meta$physical_activity_vigorous == "0_times" | is.na(meta$physical_activity_vigorous), "None", "Vigorous")
meta$phys[which(meta$phys == "None" & meta$physical_activity_moderate != "0_times" & !is.na(meta$physical_activity_moderate))] <- "Moderate"
meta$phys[which(meta$phys == "None" & meta$physical_activity_light != "0_times" & !is.na(meta$physical_activity_light))] <- "Light"
meta$phys <- as.factor(meta$phys)

#filter to just two races
meta <- meta %>%
  filter(race == "african_american" | race == "caucasian")
meta$race <- factor(meta$race)
rownames(meta) <- meta$X
meta$X <- NULL
t_otu <- t_otu[rownames(meta),]
```

```{r EditMeta}
meta <- meta %>%
  mutate(antibiotics_current = replace_na(antibiotics_current, "No"))

meta <- meta %>%
  mutate(infertility_treatment = replace_na(infertility_treatment, "No")) %>%
  mutate(infertility_treatment = recode(infertility_treatment, Not_sure = "No", .default = levels(meta$infertility_treatment)))

meta <- meta %>%
  mutate(vdischarge = replace_na(vdischarge, "No")) %>%
  mutate(vdischarge = recode(vdischarge, Not_Sure = "No", .default = levels(meta$vdischarge)))

meta <- meta %>%
  mutate(vodor = replace_na(vodor, "No")) %>%
  mutate(vodor = recode(vodor, Not_Sure = "No", .default = levels(meta$vodor)))

meta <- meta %>%
  mutate(vitching = replace_na(vitching, "No"))

meta <- meta %>%
  mutate(douche = replace_na(douche, "More_than_1_year_ago_or_never"))

meta <- meta %>%
  mutate(bv_history = replace_na(bv_history, "No")) %>%
  mutate(bv_history = recode(bv_history, Not_Sure = "No", .default = levels(meta$bv_history)))

meta <- meta %>%
  mutate(vaginitis_history = replace_na(vaginitis_history, "No"))

meta <- meta %>%
  mutate(current_medications = replace_na(current_medications, "No")) %>%
  mutate(current_medications = recode(current_medications, No_I_am_not_taking_any_new_medications = "No", .default = levels(meta$current_medications)))

meta <- meta %>%
  mutate(smoker_current = replace_na(smoker_current, "No"))

meta <- meta %>%
  mutate(alcohol_frequency_year = replace_na(alcohol_frequency_year, "0_times"))

meta <- meta %>%
  mutate(diabetes = replace_na(diabetes, "No")) %>%
  mutate(diabetes = recode(diabetes, Not_sure = "No", .default = levels(meta$diabetes)))

meta <- meta %>%
  mutate(no_false_labor = replace_na(no_false_labor, "Yes")) %>%
  mutate(no_false_labor = recode(no_false_labor, No_I_have_not_experienced_false_labor = "Yes", .default = levels(meta$no_false_labor)))

meta <- meta %>%
  mutate(no_high_bp = replace_na(no_high_bp, "Yes"))

meta <- meta %>%
  mutate(no_vbleeding = replace_na(no_vbleeding, "Yes"))

meta <- meta %>%
  mutate(progesterone = replace_na(progesterone, "No")) %>%
  mutate(progesterone = recode(progesterone, Not_Sure = "No", .default = levels(meta$progesterone)))

levels(meta$age)[1] <- NA
```


```{r }
#Create combined table
full_set <- cbind(t_otu, meta)
full_set$alpha_div <- diversity(full_set[,1:length(names(t_otu))], index="shannon")
full_set$num_reads <- rowSums(t_otu)

#Add vagitype
mypropdata <- full_set[,colnames(t_otu)]
mytypes <- apply(mypropdata,1,which.max)
maxprop <- as.numeric(mypropdata[matrix(c(1:nrow(mypropdata),mytypes), ncol=2)])
mytypes <- colnames(mypropdata)[mytypes]
mytypes[maxprop < 0.3] <- "No Type"
mytypes <- as.factor(mytypes)
full_set <- full_set %>%
  mutate(vagitype=mytypes)
high_vagitype <- sort(table(full_set$vagitype), decreasing=T)[1:5]
high_vagitype
#Add proportions
full_set$p_Iners <- full_set$Lactobacillus_iners/full_set$num_reads
full_set$p_Crisp <- full_set$Lactobacillus_crispatus_cluster/full_set$num_reads
full_set$p_Gard <- full_set$Gardnerella_vaginalis/full_set$num_reads
full_set$Gass <- full_set$Lactobacillus_gasseri_cluster/full_set$num_reads
full_set$p_BVAB <- full_set$Lachnospiraceae_BVAB1/full_set$num_reads

```





```{r}
get_mode = function(x, multimodal.na="TRUE"){
  modes <- Mode(x)
  if (multimodal.na=="FALSE" | length(modes)==1) {
    return(modes[1]) 
  } else {
    return(modes[length(modes)+1])
  }
}

#582 subjects
#aggregate at patient level
demog <- aggregate(visit_number ~ SUBJECT_ID, data = meta%>%data.frame, FUN = min)
demog <- merge(demog,meta%>%data.frame, by = c("SUBJECT_ID", "visit_number") )
  
#number visits per subject 
df <- aggregate(visit_number ~ SUBJECT_ID, data = meta%>%data.frame, FUN = length)
colnames(df)[which(colnames(df) == "visit_number")] <- "number of visits"
demog <- merge(demog,df, by = c("SUBJECT_ID") )

#average income per subject 
df <- meta %>% group_by(SUBJECT_ID) %>% summarise(income = get_mode(income, multimodal.na = "FALSE"))
colnames(df)[which(colnames(df) == "income")] <- "average income"
demog <- merge(demog,df, by = c("SUBJECT_ID") )

#average education per subject 
df <- meta %>% group_by(SUBJECT_ID) %>% summarise(education = get_mode(education, multimodal.na = "FALSE"))
colnames(df)[which(colnames(df) == "education")] <- "average education"
demog <- merge(demog,df, by = c("SUBJECT_ID") )

#average age per subject 
df <- meta %>% group_by(SUBJECT_ID) %>% summarise(age = get_mode(age, multimodal.na = "FALSE"))
colnames(df)[which(colnames(df) == "age")] <- "average age"
demog <- merge(demog,df, by = c("SUBJECT_ID") )

#average prenatal care start per subject 
df <- meta %>% group_by(SUBJECT_ID) %>% summarise(prenatal_care_start = get_mode(prenatal_care_start, multimodal.na = "FALSE"))
colnames(df)[which(colnames(df) == "prenatal_care_start")] <- "average p_care_start"
demog <- merge(demog,df, by = c("SUBJECT_ID") )

#average infertility treatment per subject 
df <- meta %>% group_by(SUBJECT_ID) %>% summarise(infertility_treatment = get_mode(infertility_treatment, multimodal.na = "FALSE"))
colnames(df)[which(colnames(df) == "infertility_treatment")] <- "average inf_trt"
demog <- merge(demog,df, by = c("SUBJECT_ID") )

#average v_discharge per subject 
df <- meta %>% group_by(SUBJECT_ID) %>% summarise(vdischarge = get_mode(vdischarge, multimodal.na = "FALSE"))
colnames(df)[which(colnames(df) == "vdischarge")] <- "average v_discharge"
demog <- merge(demog,df, by = c("SUBJECT_ID") )

#average v_odor per subject 
df <- meta %>% group_by(SUBJECT_ID) %>% summarise(vodor = get_mode(vodor, multimodal.na = "FALSE"))
colnames(df)[which(colnames(df) == "vodor")] <- "average v_odor"
demog <- merge(demog,df, by = c("SUBJECT_ID") )

#average v_itching per subject 
df <- meta %>% group_by(SUBJECT_ID) %>% summarise(vitching = get_mode(vitching, multimodal.na = "FALSE"))
colnames(df)[which(colnames(df) == "vitching")] <- "average v_itching"
demog <- merge(demog,df, by = c("SUBJECT_ID") )

#average douche per subject 
df <- meta %>% group_by(SUBJECT_ID) %>% summarise(douche = get_mode(douche, multimodal.na = "FALSE"))
colnames(df)[which(colnames(df) == "douche")] <- "average douche"
demog <- merge(demog,df, by = c("SUBJECT_ID") )

#average bv_history per subject 
df <- meta %>% group_by(SUBJECT_ID) %>% summarise(bv_history = get_mode(bv_history, multimodal.na = "FALSE"))
colnames(df)[which(colnames(df) == "bv_history")] <- "average bv_history"
demog <- merge(demog,df, by = c("SUBJECT_ID") )

#average vaginitis_history per subject 
df <- meta %>% group_by(SUBJECT_ID) %>% summarise(vaginitis_history = get_mode(vaginitis_history, multimodal.na = "FALSE"))
colnames(df)[which(colnames(df) == "vaginitis_history")] <- "average vaginitis_history"
demog <- merge(demog,df, by = c("SUBJECT_ID") )

#average current_medications per subject 
df <- meta %>% group_by(SUBJECT_ID) %>% summarise(current_medications = get_mode(current_medications, multimodal.na = "FALSE"))
colnames(df)[which(colnames(df) == "current_medications")] <- "average current_medications"
demog <- merge(demog,df, by = c("SUBJECT_ID") )

#average smoker_current per subject 
df <- meta %>% group_by(SUBJECT_ID) %>% summarise(smoker_current = get_mode(smoker_current, multimodal.na = "FALSE"))
colnames(df)[which(colnames(df) == "smoker_current")] <- "average smoker_current"
demog <- merge(demog,df, by = c("SUBJECT_ID") )

#average alcohol_frequency per subject 
df <- meta %>% group_by(SUBJECT_ID) %>% summarise(alcohol_frequency_year = get_mode(alcohol_frequency_year, multimodal.na = "FALSE"))
colnames(df)[which(colnames(df) == "alcohol_frequency_year")] <- "average alcohol_frequency"
demog <- merge(demog,df, by = c("SUBJECT_ID") )

#average diabetes per subject 
df <- meta %>% group_by(SUBJECT_ID) %>% summarise(diabetes = get_mode(diabetes, multimodal.na = "FALSE"))
colnames(df)[which(colnames(df) == "diabetes")] <- "average diabetes"
demog <- merge(demog,df, by = c("SUBJECT_ID") )

#average false_labor per subject 
df <- meta %>% group_by(SUBJECT_ID) %>% summarise(no_false_labor = get_mode(no_false_labor, multimodal.na = "FALSE"))
colnames(df)[which(colnames(df) == "no_false_labor")] <- "average no_false_labor"
demog <- merge(demog,df, by = c("SUBJECT_ID") )

#average high_bp per subject 
df <- meta %>% group_by(SUBJECT_ID) %>% summarise(no_high_bp = get_mode(no_high_bp, multimodal.na = "FALSE"))
colnames(df)[which(colnames(df) == "no_high_bp")] <- "average no_high_bp"
demog <- merge(demog,df, by = c("SUBJECT_ID") )

#average v_bleeding per subject 
df <- meta %>% group_by(SUBJECT_ID) %>% summarise(no_vbleeding = get_mode(no_vbleeding, multimodal.na = "FALSE"))
colnames(df)[which(colnames(df) == "no_vbleeding")] <- "average no_vbleeding"
demog <- merge(demog,df, by = c("SUBJECT_ID") )

#average progesterone per subject 
df <- meta %>% group_by(SUBJECT_ID) %>% summarise(progesterone = get_mode(progesterone, multimodal.na = "FALSE"))
colnames(df)[which(colnames(df) == "progesterone")] <- "average progesterone"
demog <- merge(demog,df, by = c("SUBJECT_ID") )

#average vaginal_ph across visits
#Some patients have na so goes to 568
df <- aggregate(vaginal_pH ~ SUBJECT_ID, data = meta%>%data.frame, FUN = mean)
colnames(df)[which(colnames(df) == "vaginal_pH")] <- "mean_vaginal_ph"
demog <- merge(demog,df, by = c("SUBJECT_ID") )
#alpha diversity
df <- aggregate(alpha_div ~ SUBJECT_ID, data = full_set%>%data.frame, FUN = mean)
colnames(df)[which(colnames(df) == "alpha_div")] <- "mean_alpha_div"
demog <- merge(demog,df, by = c("SUBJECT_ID") )

#reported_ga is only for minimum recorded visit for each individual
Varnames <- c("mean_vaginal_ph", "number of visits", "reported_ga", "mean_alpha_div", "average income", 
              "average education", "average age", "average p_care_start", "average inf_trt", 
              "average  v_discharge", "average v_odor", "average v_itching", "average douche", 
              "average bv_history", "average vaginitis_history", "average current_medications",
              "average smoker_current", "average alcohol_frequency", "average diabetes", 
              "average no_false_labor", "average no_high_bp", "average no_vbleeding", "average progesterone")
#racial breakdown, break down visits and other demographics
stratified1 = tableone::CreateTableOne(
  vars = Varnames[1:4],
  data = summarytools::unlabel(demog), strata = "race", includeNA = TRUE)
stratified2 = tableone::CreateTableOne(
  vars = Varnames[5:7],
  data = summarytools::unlabel(demog), strata = "race", includeNA = TRUE)
stratified3 = tableone::CreateTableOne(
  vars = Varnames[8:15],
  data = summarytools::unlabel(demog), strata = "race", includeNA = TRUE)
stratified4 = tableone::CreateTableOne(
  vars = Varnames[16:19],
  data = summarytools::unlabel(demog), strata = "race", includeNA = TRUE)
stratified5 = tableone::CreateTableOne(
  vars = Varnames[20:23],
  data = summarytools::unlabel(demog), strata = "race", includeNA = TRUE)

```

```{r }
#stratified1 <- print(stratified1, printToggle = FALSE, showAllLevels = FALSE)
#stratified1 = stratified1[,!(colnames(stratified1) %in% "test")]%>%
#  knitr::kable(format="latex")%>% 
#  kable_styling("striped", full_width = F, font_size = 9)
stratified1
#fix the part where every race is shown, probably due to meta being called
```

```{r }
#stratified1 <- print(stratified1, printToggle = FALSE, showAllLevels = FALSE)
#stratified1 = stratified1[,!(colnames(stratified1) %in% "test")]%>%
#  knitr::kable(format="latex")%>% 
#  kable_styling("striped", full_width = F, font_size = 9)
stratified2
#fix the part where every race is shown, probably due to meta being called
```

```{r }
#stratified1 <- print(stratified1, printToggle = FALSE, showAllLevels = FALSE)
#stratified1 = stratified1[,!(colnames(stratified1) %in% "test")]%>%
#  knitr::kable(format="latex")%>% 
#  kable_styling("striped", full_width = F, font_size = 9)
stratified3
#fix the part where every race is shown, probably due to meta being called
```

```{r }
#stratified1 <- print(stratified1, printToggle = FALSE, showAllLevels = FALSE)
#stratified1 = stratified1[,!(colnames(stratified1) %in% "test")]%>%
#  knitr::kable(format="latex")%>% 
#  kable_styling("striped", full_width = F, font_size = 9)
stratified4
#fix the part where every race is shown, probably due to meta being called
```

```{r }
#stratified1 <- print(stratified1, printToggle = FALSE, showAllLevels = FALSE)
#stratified1 = stratified1[,!(colnames(stratified1) %in% "test")]%>%
#  knitr::kable(format="latex")%>% 
#  kable_styling("striped", full_width = F, font_size = 9)
stratified5
#fix the part where every race is shown, probably due to meta being called
```

```{r}
#Find out individuals who have a different vagitype across visits
uniq_vagitype <- full_set %>%
  group_by(SUBJECT_ID) %>%
  summarize(n_unique = n_distinct(vagitype))
more_vagitype <- uniq_vagitype %>%
  filter(n_unique > 1)
full_set %>%
  filter(SUBJECT_ID %in% more_vagitype$SUBJECT_ID) %>%
  group_by(SUBJECT_ID) %>%
  distinct(vagitype)

#Look at proportional changes over time, look at differences betweeen those that did change and those that did not
#Look at how they change, if they change similarly
```



```{r}

count_table <- aggregate(list(full_set$Lactobacillus_iners, full_set$p_Iners,full_set$Lactobacillus_crispatus_cluster, full_set$p_Crisp, full_set$Gardnerella_vaginalis, full_set$p_Gard, full_set$Lactobacillus_gasseri_cluster, full_set$Gass, full_set$Lachnospiraceae_BVAB1, full_set$p_BVAB), by = list(full_set$race, full_set$reported_ga), mean)
colnames(count_table) <- c("race", "reported_ga", "Lactobacillus_iners", "p_Iners", "Crispatus", "p_Crisp", 
                           "Gardnerella", "p_Gard", "Gasseri", "p_Gass", "BVAB", "p_BVAB")
count_table <- count_table %>%
  arrange(race)
kable(count_table,  table.attr = "style = \"color: black;\"") %>%
  kable_styling(bootstrap_options = c("striped", "hover"))
```

```{r }
#Plot proportions over time, plot count over time
iners <- ggplot(count_table, aes(reported_ga, p_Iners)) + 
  geom_line(aes(group=race, color=race)) 
crisp <- ggplot(count_table, aes(reported_ga, p_Crisp)) + 
  geom_line(aes(group=race, color=race)) 
gard <- ggplot(count_table, aes(reported_ga, p_Gard)) + 
  geom_line(aes(group=race, color=race)) 
gass <- ggplot(count_table, aes(reported_ga, p_Gass)) + 
  geom_line(aes(group=race, color=race)) 
bvab <- ggplot(count_table, aes(reported_ga, p_BVAB)) + 
  geom_line(aes(group=race, color=race)) 
library(gridExtra)
ggarrange(iners, crisp, bvab, gard, gass, nrow = 3)
#Plot individuals for each race
#ggplot(full_set, aes(reported_ga, p_Iners)) + 
#  geom_line(aes(group=SUBJECT_ID, color=race)) + 
#  facet_wrap(~race, ncol=1)


#Look at those with the specific taxa as their vagitype
```


```{r}
prop_plot <- ggplot(full_set, aes(p_Iners)) + geom_histogram() + geom_vline(xintercept=mean(full_set$p_Iners), lwd=1, linetype=2, color="red") + geom_vline(xintercept=median(full_set$p_Iners), lwd=1, linetype=2, color="green")
#prop_plot
cprop_plot <- ggplot(full_set, aes(p_Crisp)) + geom_histogram() + geom_vline(xintercept=mean(full_set$p_Crisp), lwd=1, linetype=2, color="red") + geom_vline(xintercept=median(full_set$p_Crisp), lwd=1, linetype=2, color="green")
#iprop_plot

bprop_plot <- ggplot(full_set, aes(p_BVAB)) + geom_histogram() + geom_vline(xintercept=mean(full_set$p_BVAB), lwd=1, linetype=2, color="red") + geom_vline(xintercept=median(full_set$p_BVAB), lwd=1, linetype=2, color="green")
#pprop_plot

gprop_plot <- ggplot(full_set, aes(p_Gard)) + geom_histogram() + geom_vline(xintercept=mean(full_set$p_Gard), lwd=1, linetype=2, color="red") + geom_vline(xintercept=median(full_set$p_Gard), lwd=1, linetype=2, color="green")
#tprop_plot

gasprop_plot <- ggplot(full_set, aes(Gass)) + geom_histogram() + geom_vline(xintercept=mean(full_set$Gass), lwd=1, linetype=2, color="red") + geom_vline(xintercept=median(full_set$Gass), lwd=1, linetype=2, color="green")
#bprop_plot
#stratify by race
ggarrange(prop_plot, cprop_plot, bprop_plot, gprop_plot, gasprop_plot, labels = c("Iners","Crisp", "BVAB", "Gard", "Gass"), ncol = 3, nrow = 2)

```

```{r}
race_prop_plot <- ggplot(full_set, aes(p_Iners)) + geom_histogram(aes(group=race, fill=race)) + facet_wrap(~race) 

crace_prop_plot <- ggplot(full_set, aes(p_Crisp)) + geom_histogram(aes(group=race, fill=race)) + facet_wrap(~race) 

#stratify by race
brace_prop_plot <- ggplot(full_set, aes(p_BVAB)) + geom_histogram(aes(group=race, fill=race)) + facet_wrap(~race) 

grace_prop_plot <- ggplot(full_set, aes(p_Gard)) + geom_histogram(aes(group=race, fill=race)) + facet_wrap(~race)

gassrace_prop_plot <- ggplot(full_set, aes(Gass)) + geom_histogram(aes(group=race, fill=race)) + facet_wrap(~race)

ggarrange(race_prop_plot, crace_prop_plot, brace_prop_plot, grace_prop_plot, gassrace_prop_plot, nrow = 3)

```


```{r Count}
#Zero-Inflated Poisson Model
z <- glmmTMB(Lactobacillus_iners ~ vaginal_pH + reported_ga + alpha_div + (1|SUBJECT_ID), data = full_set, family = poisson, zi = ~ vaginal_pH + reported_ga + alpha_div)
summary(z)
hist(predict(z))
plot(full_set$Lactobacillus_iners[which(names(linpred) %in% rownames(full_set))], linpred)

#summary(m1 <- zeroinfl(Lactobacillus_iners ~ vaginal_pH + reported_ga + alpha_div |SUBJECT_ID, data = full_set))
z <- glmmTMB(Lactobacillus_iners ~ vaginal_pH + reported_ga + alpha_div + (1|SUBJECT_ID), data = full_set, family = poisson, zi = ~ 0)
summary(z)
hist(predict(z))


#Zero-Inflated Negative Binomial Model
neg_bin<- glmmTMB(Lactobacillus_iners ~ vaginal_pH + reported_ga + alpha_div + offset(log(num_reads)) + (1|SUBJECT_ID), data = full_set, family = nbinom2, zi = ~ vaginal_pH + reported_ga + alpha_div)
summary(neg_bin)
predict(neg_bin)
neg_binone <- glmmTMB(Lactobacillus_iners ~ vaginal_pH + reported_ga + alpha_div + offset(log(num_reads)) + (1|SUBJECT_ID), data = full_set, family = nbinom2, zi = ~ 1)
summary(neg_binone)
hist(predict(neg_binone))
#Zero-Inflated Beta-Binomial Model
#kostic.y <- cbind(1, kostic.y=="Tumor")
beta_bin<- glmmTMB(Lactobacillus_iners ~ vaginal_pH + reported_ga + alpha_div + race + (1|SUBJECT_ID), data = full_set, family=betabinomial(link = "logit"), zi = ~ vaginal_pH + reported_ga + alpha_div + race)


#Mixed Linear Regression Model
mixedL <- lmer(Lactobacillus_iners ~ vaginal_pH + reported_ga + alpha_div + race + (1|SUBJECT_ID), data = full_set)
summary(mixedL)
linpred <- predict(mixedL)
#predicted
#x-axis people
#y-axis visits for that person
plot(full_set$Lactobacillus_iners[which(names(linpred) %in% rownames(full_set))], linpred)


## Beta bimonial zero inflated ##

## NBZIMM ##
f21 = glmm.zinb(Lactobacillus_iners ~ vaginal_pH + reported_ga + alpha_div , random = ~ 1|SUBJECT_ID, data = full_set)
summary(f21)
summary(f21)$tTable[4, 5]

```

```{r ProportionsData}
hist(full_set$p_Iners)
beta_bin<- glmmTMB(p_Iners ~ vaginal_pH + reported_ga + alpha_div + race + (1|SUBJECT_ID), data = full_set, family=betabinomial(link = "logit"), zi = ~ vaginal_pH + reported_ga + alpha_div + race)
mean(full_set$p_Iners)
var(full_set$p_Iners)

library(zoib)
#zoib
#zoib(p_Iners ~ vaginal_pH + reported_ga + alpha_div + race | 1|1, data = full_set, random = 1, EUID = full_set$SUBJECT_ID)
library(aod)
full_set$p_Iners[which(full_set$p_Iners == 1)] <- 0.99999
betaIners <- glmmTMB(p_Iners ~ vaginal_pH + reported_ga + alpha_div + race + (1|SUBJECT_ID), data = full_set, family=beta_family(link = "logit"), zi = ~ 1)
betapred <- predict(betaIners)
summary(betaIners)
#predicted
#x-axis people
#y-axis visits for that person
#plot(full_set$p_Iners[which(names(betapred) %in% rownames(full_set))], betapred)

#zoib(p_Iners ~ vaginal_pH + reported_ga + alpha_div + race | 1, data = full_set)


mixedL <- lmer(p_Iners ~ vaginal_pH + reported_ga + alpha_div + race + (1|SUBJECT_ID), data = full_set)

#par(mfrow = c(2,2))

coefplot2(mixedL)
plot(mixedL)
linpred <- predict(mixedL)

#predicted
#x-axis people
#y-axis visits for that person
plot(full_set$p_Iners[which(names(linpred) %in% rownames(full_set))], linpred)
```


```{r}
full_set$class_Iners <- as.factor(ifelse(full_set$p_Iners > mean(full_set$p_Iners), "High", "Low"))
full_set$class_Crisp <- as.factor(ifelse(full_set$p_Crisp > mean(full_set$p_Crisp), "High", "Low"))
full_set$class_Gass <- as.factor(ifelse(full_set$Gass > mean(full_set$Gass), "High", "Low"))
full_set$class_Gard <- as.factor(ifelse(full_set$p_Gard > mean(full_set$p_Gard), "High", "Low"))
full_set$class_BVAB <- as.factor(ifelse(full_set$p_BVAB > mean(full_set$p_BVAB), "High", "Low"))


#Check median
#add line to show where each point is
```


```{r}
#install.packages("ggpubr")
iners_plot <- ggplot(full_set, aes(class_Iners)) + geom_bar()
snea_plot <- ggplot(full_set, aes(class_Crisp)) + geom_bar() 
tm_plot <- ggplot(full_set, aes(class_Gass)) + geom_bar() 
prev_plot <- ggplot(full_set, aes(class_Gard)) + geom_bar() 
bvab_plot <- ggplot(full_set, aes(class_BVAB)) + geom_bar()

ggarrange(iners_plot, snea_plot, tm_plot, prev_plot, bvab_plot, labels = c("Iners", "Crisp", "Gass", "Gard", "BVAB1"), ncol = 3, nrow = 2)


```

```{r}
#Distribution of High and Low (do it across race)
irace_prop_plot <- ggplot(full_set, aes(class_Iners)) + geom_bar(aes(y= stat(prop),group=race, fill=race), position = "dodge2")+ ggtitle("Iners Dichotomized Histogram")
irace_prop_plot

#Distribution of High and Low (do it across race)
srace_prop_plot <- ggplot(full_set, aes(class_Crisp)) + geom_bar(aes(y= stat(prop), group=race, fill=race), position = "dodge2") + ggtitle("Crisp Dichotomized Histogram")


#Distribution of High and Low (do it across race)
prace_prop_plot <- ggplot(full_set, aes(class_Gass)) + geom_bar(aes(y= stat(prop), group=race, fill=race), position = "dodge2") +  ggtitle("Gass Dichotomized Histogram")


#Distribution of High and Low (do it across race)
trace_prop_plot <- ggplot(full_set, aes(class_Gard)) + geom_bar(aes(y= stat(prop), group=race, fill=race), position = "dodge2") + ggtitle("Gard Dichotomized Histogram")


#Distribution of High and Low (do it across race)
brace_prop_plot <- ggplot(full_set, aes(class_BVAB)) + geom_bar(aes(y= stat(prop),group=race, fill=race), position = "dodge2") + ggtitle("BVAB Dichotomized Histogram")


ggarrange(irace_prop_plot, srace_prop_plot, prace_prop_plot, trace_prop_plot, brace_prop_plot, nrow = 3)
#Use proportions instead of counts for better comparison, DONE
#position dodge, DONE
#Add proportion values to top of bars
#do facet wrap by taxa
```

```{r}
covs.taxa <- function(df, taxon, model, vars){
  tax_prop <- df[,taxon]/df[,"num_reads"]
   if(model == "fixed"){
    #covs <- vars
    model_data <- df[,vars]
    half <- paste(vars, collapse= "+")
  }
  else{
    #covs <- vars
    model_data <- df[,c(vars, "SUBJECT_ID", "num_reads")]
    half <- paste(paste(vars, collapse= "+"), " + (1|SUBJECT_ID)")
  }
  model_data <- cbind(tax_prop, model_data)
  no.na.data <- na.omit(model_data)
  form <- as.formula(paste("tax_prop ~ ", half))
  if(model == "fixed"){
    mod <- lm(form, data = no.na.data)
    step.back <- stepAIC(mod, direction = "backward")
    return(step.back)
    #return(mod)
  }
  else{
    mod <- lmer(form, data = no.na.data)
    return(mod)
    #return(mod.step)
  }
}


```


```{r }
taxa.glm <- function(df, taxon, type, model){
  tax_count <- df[,taxon]
  tax_prop <- df[,taxon]/df[,"num_reads"]
  class_tax <- as.factor(ifelse(tax_prop > mean(tax_prop), "High", "Low"))
  class_tax <- recode(class_tax, "High"= 1, "Low"= 0)
  if(model == "fixed"){
    covs <- c("vaginal_pH", "reported_ga", "alpha_div", "race", "vagitype", "income", "bv_history", "phys", "antibiotics_current","age", "education", "prenatal_care_start", "infertility_treatment", "vdischarge", "vodor", "vitching", "douche", "vaginitis_history", "current_medications", "smoker_current", "alcohol_frequency_year", "diabetes", "no_false_labor", "no_high_bp", "no_vbleeding", "progesterone")
    model_data <- df[,covs]
    half <- paste(covs, collapse= "+")
  }
  else{
    covs <- c("vaginal_pH", "reported_ga", "alpha_div", "race", "vagitype", "income", "bv_history", "phys", "antibiotics_current","age", "education", "prenatal_care_start", "infertility_treatment", "vdischarge", "vodor", "vitching", "douche", "vaginitis_history", "current_medications", "smoker_current", "alcohol_frequency_year", "diabetes", "no_false_labor", "no_high_bp", "no_vbleeding", "progesterone")
    model_data <- df[,c(covs, "SUBJECT_ID", "num_reads")]
    half <- paste(paste(covs, collapse= "+"), " + (1|SUBJECT_ID)")
  }
  if(type == "count"){
    model_data <- cbind(tax_count, model_data)
    no.na.data <- na.omit(model_data)
    form <- as.formula(paste("tax_count ~ ", half))
    if(model == "fixed"){
      mod <- glm.nb(form, data = no.na.data)
    }
    else{
      mod <- glmer.nb(form, data = no.na.data)
    }
  }
  else if(type == "prop"){
    model_data <- cbind(tax_prop, model_data)
    no.na.data <- na.omit(model_data)
    form <- as.formula(paste("tax_prop ~ ", half))
    if(model == "fixed"){
      mod <- lm(form, data = no.na.data)
      #return(mod)
    }
    else{
      mod <- lmer(form, data = no.na.data)
      return(mod)
      #return(mod.step)
    }
  }
  else{
    model_data <- cbind(class_tax, model_data)
    no.na.data <- na.omit(model_data)
    form <- as.formula(paste("class_tax ~ ", paste(covs, collapse= "+")))
    if(model == "fixed"){
      mod <- glm(form, data = no.na.data, family = binomial)
    }
    else{
      mod <- glmer(form, data = no.na.data, family = binomial, control=glmerControl(optimizer="bobyqa",optCtrl=list(maxfun=2e5)))
    }
  }
  #step.model <- stepAIC(fixed_mod, direction = "both", trace = FALSE)
  step.back <- stepAIC(mod, direction = "backward", trace = FALSE)
  return(step.back)
  #TO ADD: Plot for each covariate the predicted distribution
  #Output table of common variables, extract p-value
}
```

```{r }
model_plot <- function(model){
  final_covs <- names(model$model)[-1]
  print(deparse(substitute(model)))
  print(as.formula(paste("class_tax ~ ", paste(final_covs, collapse= "+"))))
  #Plot AIC for model
  vars <- unlist(lapply(strsplit(as.character(model$anova$Step), '- ', fixed = TRUE), '[', 2))
  vars[1] <- "Intercept"
  AIC.res <- data.frame(Variable = vars, AIC = model$anova$AIC)
  AIC.res$Variable <- factor(AIC.res$Variable, levels = vars)
  #true variables
  #true_vars <- paste0("var", 1:10)
  #colour_vars <- ifelse(AIC.res$Variable %in% true_vars, 'red','black')
  plot <- ggplot(data=AIC.res, aes(x=Variable, y=AIC, group=1)) +
    geom_line(color = "red")+
    theme_bw()+
    theme(axis.text.x = element_text(angle = 60, hjust = 1))
  return(plot)
}
```

```{r }
pred_plot <- function(df, model){
  model_data <- df[,covs]
  model_data <- cbind(class_tax, model_data)
  no.na.data <- na.omit(model_data)
  final_covs <- names(model$model)[-1]
  pred_data <- no.na.data[,final_covs]
  return(predict(model, newdata=pred_data, type = "response"))
  
  
  
  # if(c("alpha_div","vagitype") %in% names(model$model)){
  #   plotting_dfm <- expand.grid(alpha_div = seq(0, 3.5, 0.01),
  #                          vagitype = c("Lactobacillus_crispatus_cluster", "Lactobacillus_iners", "Lachnospiraceae_BVAB1", "Gardnerella_vaginalis", "Lactobacillus_gasseri_cluster"))
  #   plotting_dfm$preds <- predict(model, newdata=plotting_dfm, type = "response")
  #   pl <- ggplot(plotting_dfm, aes(x=alpha_div, y =preds, color=as.factor(vagitype))) +
  #     geom_point( ) +
  #     ggtitle("Predicted High/Low by Alpha_Div and Vagitype")
  #   return(p1)
  # }
  # else{
  #   return ("No value")
  # }
}

```

```{r}
get_data <- function(taxon, df){
  tax_prop <- df[,taxon]/df[,"num_reads"]
  covs <- c("vaginal_pH", "reported_ga", "alpha_div", "race", "vagitype", "income", "bv_history", "phys", "antibiotics_current","age", "education", "prenatal_care_start", "infertility_treatment", "vdischarge", "vodor", "vitching", "douche", "vaginitis_history", "current_medications", "smoker_current", "alcohol_frequency_year", "diabetes", "no_false_labor", "no_high_bp", "no_vbleeding", "progesterone")
  model_data <- df[,c(covs, "SUBJECT_ID", "num_reads")]
  model_data <- cbind(tax_prop, model_data)
  no.na.data <- na.omit(model_data)
  return(no.na.data)
}
```

```{r}
make_hists<- function(model, taxon, df, vars){
  hists <- data.frame(xx = c(predict(model, newdata = get_data(taxon, df)[, vars]), get_data(taxon, df)[, "tax_prop"]), yy = rep(c("pred", "actual"),each = length(get_data(taxon, df)[, "tax_prop"])))
  return(ggplot(hists, aes(x=xx, fill=yy)) + geom_histogram(alpha=0.2, position="identity"))
}

```

```{r}
agg_plot <- function(model, taxon, df, vars){
  ex <- data.frame(data = c(predict(model, newdata = get_data(taxon, df)[, vars]), get_data(taxon, df)[, "tax_prop"]), 
                   race = get_data(taxon, df)[, "race"], tri = get_data(taxon, df)[, "reported_ga"],
                   yy = rep(c("pred", "actual"),each = length(get_data(taxon, df)[, "tax_prop"])))
  agg_table <- aggregate(list(ex$data), by = list(ex$race, ex$tri, ex$yy), mean)
  colnames(agg_table) <- c("race", "tri", "type", "data")
  agg_plot <- ggplot(agg_table, aes(tri, data)) + 
geom_line(aes(group=interaction(race, type), color=race, linetype = type)) + 
    scale_x_discrete(labels=c("First Trimester" = "1st", "Second Trimester" = "2nd",
                              "Third Trimester" = "3rd")) + 
    ylab("Prop")
  return(agg_plot)
}

```

```{r}
getPerformace <- function(model, df){
  taxa <- colnames(t_otu)
  example <- c("Lactobacillus_iners", "Lactobacillus_crispatus_cluster", "Gardnerella_vaginalis", "Lachnospiraceae_BVAB1", "Lactobacillus_gasseri_cluster", "Sneathia_amnii", "Prevotella_cluster2", "TM7_OTU-H1")
  for (name in example){
    md <- covs.taxa
  }
  perf <- model_performance(model, metrics = "all", verbose = TRUE)
  return(perf)
  
}


```

```{r}
subsetFull <- function(df, taxalist){
  `%ni%` <- Negate(`%in%`)
  cols <- sapply(taxalist, function (x) any(df[,x]/df$num_reads > 0.05))
  df <- subset(df,select = names(df) %ni% names(which(cols == FALSE)))
  return(df)
}
  

```

```{r}
covs <- c("vaginal_pH", "alpha_div", "reported_ga", "race", "income", "bv_history", "phys", "antibiotics_current","age", "education", "prenatal_care_start", "infertility_treatment", "vdischarge", "vodor", "vitching", "douche", "vaginitis_history", "current_medications", "smoker_current", "alcohol_frequency_year", "diabetes", "no_false_labor", "no_high_bp", "no_vbleeding", "progesterone")
model_data <- full_set[,covs]
ggpairs(model_data, columns = 1:2, ggplot2::aes(colour=race)) #too many points, too many vagitypes as well so had to remove
#do top 5 vagitypes
aa <- full_set[full_set$race == 'african_american',]
histogram(aa$vaginal_pH)
```


```{r CompareModelType}
count_Iners <- taxa.glm(full_set, "Lactobacillus_iners", "count", "fixed") #Does not converge
prop_Iners <- taxa.glm(full_set, "Lactobacillus_iners", "prop", "fixed")
class_Iners <- taxa.glm(full_set, "Lactobacillus_iners", "class", "fixed")
mcount_Iners <- taxa.glm(full_set, "Lactobacillus_iners", "count", "mixed") #Does not converge
mprop_Iners <- taxa.glm(full_set, "Lactobacillus_iners", "prop", "mixed") #failure to converge in 10000 evaluations
mod_Crisp <- taxa.glm(full_set, "Lactobacillus_crispatus_cluster", "class", "fixed")
mod_Iners <- taxa.glm(full_set, "Lactobacillus_iners", "class", "fixed")
mod_BVAB <- taxa.glm(full_set, "Lachnospiraceae_BVAB1", "class", "fixed")
mod_Gard <- taxa.glm(full_set, "Gardnerella_vaginalis", "class", "fixed")
mod_Gass <- taxa.glm(full_set, "Lactobacillus_gasseri_cluster", "class", "fixed")


taxalist <- c("Lactobacillus_crispatus_cluster", "Lactobacillus_iners", "Lachnospiraceae_BVAB1", "Gardnerella_vaginalis", "Lactobacillus_gasseri_cluster")
var_df <- data.frame(matrix(ncol = 15, nrow = 15))
#colnames(var_df) <- taxalist
count <- 0
for(ind in c(1:length(taxalist))){
  mods <- taxa.glm(full_set, taxalist[ind], "prop", "mixed")
  no.na.data <- get_data(taxalist[ind], full_set)
  step.back <- lmerTest::step(mods)
  fix_vars <- c(rownames(step.back$fixed[which(step.back$fixed$Eliminated == 0),]), rownames(step.back$random[which(step.back$random$Eliminated == 0),]))
  for(i in c(1:length(fix_vars))){
    var_df[i,ind+count] <- fix_vars[i]
  }
  fixed_mods <- taxa.glm(full_set, taxalist[ind], "prop", "fixed")
  fixed_vars <- names(fixed_mods$model)[-1]
  for(i in c(1:length(fixed_vars))){
    var_df[i,ind+1+count] <- fixed_vars[i]
  }
  class_mods <- taxa.glm(full_set, taxalist[ind], "class", "fixed")
  class_vars <- names(class_mods$model)[-1]
  for(i in c(1:length(class_vars))){
    var_df[i,ind+2+count] <- class_vars[i]
  }
  count <- count + 2
}

colnames(var_df) <- c("Crisp_P_M", "Crisp_P_F", "Crisp_C_F", "Iners_P_M", "Iners_P_F","Iners_C_F", "BVAB1_P_M", "BVAB1_P_F","BVAB1_C_F","Gard_P_M", "Gard_P_F", "Gard_C_F","Gass_P_M","Gass_P_F", "Gass_C_F")
options(knitr.kable.NA = '')
kable(var_df, table.attr = "style = \"color: black;\"") %>%
  kable_styling(bootstrap_options = c("striped", "hover")) %>%
  add_header_above(c("Crispatus" = 3, "Iners" = 3, "BVAB" = 3, "Gardnerella" = 3, "Gasseri" = 3))

```

```{r}
#Make table for the variables that are outputted
#need the data frame
#alpha div, vagitype, age, smoking, education, subject_id, vaginal_ph, race
#look at covariate estimates for each taxa model, consider correlations in model
#forward model selection, one variable addition at a time
#look at differences between taxa
#look for vagitypes in rare taxa
#performance with and without vagitype, compare using model comparison,ANOVA DONE
#overlay averages across each race across the 3 trimesters, output model metrics, try without and with vagitype: WROTE FUNCTION, APPLY TO ALL TAXA
#how much does vagitype add in terms of prediction, PLOTS DONE
#make summaries
#pca under both smoothed and unsmoother conditions
#prcomp on raw data, scale variables to same unit

vars <- c("vaginal_pH", "alpha_div", "race", "vagitype", "age", "education", "smoker_current")
mx_vars<- c("vaginal_pH", "alpha_div", "race", "vagitype", "age", "education", "smoker_current", "reported_ga","SUBJECT_ID")
mx_type <- c("vaginal_pH", "alpha_div", "race", "age", "education", "smoker_current", "reported_ga","SUBJECT_ID")
no_type <- c("vaginal_pH", "alpha_div", "race", "age", "education", "smoker_current")

#predict based on these models

taxalist <- c("Lactobacillus_iners", "Lactobacillus_crispatus_cluster", "Gardnerella_vaginalis", "Lachnospiraceae_BVAB1", "Lactobacillus_gasseri_cluster", "Sneathia_amnii", "Prevotella_cluster2", "TM7_OTU-H1")
tax_plots <- list()
tax_hists <- list()

#taxalist <- c("Lactobacillus_iners", "Lactobacillus_crispatus_cluster")
for (taxa in taxalist){
 mixed_full <- covs.taxa(full_set, taxa, "mixed", vars)
 mixed_notype <- covs.taxa(full_set, taxa, "mixed", no_type)
 print(taxa)
 print(anova(mixed_full, mixed_notype))
 #print(summary(mixed_notype))
 var = paste(taxa, "Full", sep = "_")
 var2 = paste(taxa, "NoType", sep = "_")
 tax_plots[[var]] <- agg_plot(mixed_full, taxa, full_set, mx_vars) + ggtitle(var)
 tax_plots[[var2]] <- agg_plot(mixed_notype, taxa, full_set, mx_type) + ggtitle(var2)
 tax_hists[[var]] <- make_hists(mixed_full, taxa, full_set, mx_vars) + ggtitle(var)
 tax_hists[[var2]] <- make_hists(mixed_notype, taxa, full_set, mx_type) + ggtitle(var2)
}
new_list = c("Lactobacillus_iners", "Lactobacillus_crispatus_cluster", "Gardnerella_vaginalis", "Lachnospiraceae_BVAB1", "Lactobacillus_gasseri_cluster")
perf <- data.frame()
predFrame <- data.frame()
see <- subsetFull(full_set, names(t_otu))
updated <- t_otu[,(names(t_otu) %in% names(see))]

#comb <- data.frame()
for (taxa in names(updated)){
  mixed_full <- covs.taxa(see, taxa, "mixed", vars)
  if (taxa %in% new_list){
    orig <- paste(taxa, "Orig", sep = "_")
    predicted <- paste(taxa, "Pred", sep = "_")
    pre_dat <- getData(mixed_full)
    pred <- predict(mixed_full)
    comb[1:length(pred),c(orig, predicted)] <- cbind(pre_dat[,c("tax_prop", "SUBJECT_ID")], pred)
    # comb <- comb %>%
    #   rename(!!orig := tax_prop, !!predicted := pred)
  }
  var = paste(taxa, "Full", sep = "_")
  predFrame[1:length(predict(mixed_full)),taxa] <- predict(mixed_full)
  perf[taxa,names(model_performance(mixed_full, metrics = "all", verbose = TRUE))] <- model_performance(mixed_full, metrics = "all", verbose = TRUE)
}
predFrame[predFrame < 0] <- 0
predFrame[predFrame > 1] <- 1
comb[comb < 0] <- 0
comb[comb > 1] <- 1
comb <- comb[ , c(2, 1, 3:ncol(comb))]
no_na <- na.omit(full_set[, vars])
pred_Full <- cbind(predFrame, no_na)
#Get the actual from the initial model
#Add in subject_id
#Truncate, DONE
```

```{r}
perf$rankRMSE <- rank(perf$RMSE)
perf$rankR2Cond <- rank(-perf$R2_conditional)
perf$Name <- rownames(perf)
kable(perf,  table.attr = "style = \"color: black;\"") %>%
  kable_styling(bootstrap_options = c("striped", "hover"))
#make boxplot for the metrics and look at outliers, use rmse
#see rank of metrics for specific taxa of interest
#add column for quantiles for the specific taxa of interest

rmse <- ggplot(perf, aes(x="RMSE", y=RMSE)) + 
  geom_boxplot() +
  geom_point(data=subset(perf, rownames(perf) %in% new_list), 
    aes(x="RMSE", y=RMSE), 
    color="red", size=5) +
  geom_text_repel(data=subset(perf, rownames(perf) %in% new_list), aes(label=Name))
rmse

r2_cond <- ggplot(perf, aes(x="R2_Cond", y=R2_conditional)) + 
  geom_boxplot() +
  geom_point(data=subset(perf, rownames(perf) %in% new_list), 
    aes(x="R2_Cond", y=R2_conditional), 
    color="red", size=5) +
  geom_text_repel(data=subset(perf, rownames(perf) %in% new_list), aes(label=Name))
r2_cond
```

```{r PredActual}
grouped <- aggregate(. ~ SUBJECT_ID, comb, mean)
grouped$rows <- as.numeric(rownames(grouped))

sorted <- comb %>%
  group_by(SUBJECT_ID) %>%
  filter(n() > 7)
sorted <- sorted %>%
  group_by(SUBJECT_ID) %>%
  mutate(id = row_number())
sorted <- ungroup(sorted)


ins = ggplot() + 
  geom_line(data = sorted, aes(x=id, y=Lactobacillus_iners_Orig,group=as.factor(SUBJECT_ID)), color = "blue") +
  geom_line(data = sorted, aes(x=id, y=Lactobacillus_iners_Pred,group=as.factor(SUBJECT_ID)), color = "red") +
  xlab('Number of Visits') +
  ylab('Prop_Iners') +
  ggtitle("Lactobacillus iners over time")
ins <- ggplotly(ins)
ins

crp = ggplot() + 
  geom_line(data = sorted, aes(x=id, y=Lactobacillus_crispatus_cluster_Orig,group=SUBJECT_ID), color = "blue") +
  geom_line(data = sorted, aes(x=id, y=Lactobacillus_crispatus_cluster_Pred,group=SUBJECT_ID), color = "red") +
  xlab('Number of Visits') +
  ylab('Prop_Crisp') +
  ggtitle("Lactobacillus crispatus over time")
crp <- ggplotly(crp)
crp

grd = ggplot() + 
  geom_line(data = sorted, aes(x=id, y=Gardnerella_vaginalis_Orig,group=SUBJECT_ID), color = "blue") +
  geom_line(data = sorted, aes(x=id, y=Gardnerella_vaginalis_Pred,group=SUBJECT_ID), color = "red") +
  xlab('Number of Visits') +
  ylab('Prop_Gard') +
  ggtitle("Gardnerella vaginalis over time")
grd <- ggplotly(grd)
grd

bvb = ggplot() + 
  geom_line(data = sorted, aes(x=id, y=Lachnospiraceae_BVAB1_Orig,group=SUBJECT_ID), color = "blue") +
  geom_line(data = sorted, aes(x=id, y=Lachnospiraceae_BVAB1_Pred,group=SUBJECT_ID), color = "red") +
  xlab('Number of Visits') +
  ylab('Prop_BVAB1') +
  ggtitle("BVAB1 over time")
bvb <- ggplotly(bvb)
bvb

gas = ggplot() + 
  geom_line(data = sorted, aes(x=id, y=Lactobacillus_gasseri_cluster_Orig,group=SUBJECT_ID), color = "blue") +
  geom_line(data = sorted, aes(x=id, y=Lactobacillus_gasseri_cluster_Pred,group=SUBJECT_ID), color = "red") +
  xlab('Number of Visits') +
  ylab('Prop_Gass') +
  ggtitle("Lactobacillus gasseri over time") +
  guides(fill=guide_legend(title="Orig/Pred"))
gas <- ggplotly(gas)
gas



#How to best effectively plot all taxa for predicted vs actual without having too many graphs
#sindex
# ggplot(aes(x = hp, y = mpg)) +
#   geom_smooth(method = "lm", se = FALSE, color = "lightgrey") +
#   geom_segment(aes(xend = hp, yend = predicted), alpha = .2) +
# 
#   # > Color adjustments made here...
#   geom_point(aes(color = residuals)) +  # Color mapped here
#   scale_color_gradient2(low = "blue", mid = "white", high = "red") +  # Colors to use here
#   guides(color = FALSE) +
#   # <
# 
#   geom_point(aes(y = predicted), shape = 1) +
#   theme_bw()

#Do high, medium, low for residuals
#Legend
#Add each subject id, color by id, line type by predicted/acutal
```

```{r TrimesterPlots}
marrangeGrob(tax_plots, nrow = 2, ncol = 2)
```

```{r Histograms}
marrangeGrob(tax_hists, nrow = 2, ncol = 2)
```

```{r}
#add predicted vs actual plot, y = x
#put table for prediction summary/statistics, See if should add more, 
#drop all taxa that have proportion of 0.05 or below, DONE
#which(apply(t_otu, 2, sum) == 0)
pca_orig <- prcomp(updated, scale = TRUE)
coords_orig <- pca_orig$x[,1:2]
#pca_other <- prcomp(otu, scale = TRUE)
pca_pred <- prcomp(predFrame, scale = TRUE)
coords_pred <- pca_pred$x[,1:2]
rot_pre <- pca_pred$rotation[,1:2]
#pca_predT <- prcomp(t(predFrame), scale = TRUE)
#print(pca_orig)
#scriplot, DONE
#summary(pca_orig)
ggarrange(autoplot(pca_orig, data = see, colour = "race"),autoplot(pca_pred, data = pred_Full, colour = "race"),
          labels = c("Orig", "Pred"))
screeplot(pca_orig)
screeplot(pca_pred)

fviz_pca_var(pca_orig,
             col.var = "contrib", # Color by contributions to the PC
             gradient.cols = c("#00AFBB", "#E7B800", "#FC4E07"),
             repel = TRUE     # Avoid text overlapping
             )
fviz_pca_var(pca_pred,
             col.var = "contrib", # Color by contributions to the PC
             gradient.cols = c("#00AFBB", "#E7B800", "#FC4E07"),
             repel = TRUE     # Avoid text overlapping
             )
#plot pca for variables and variable loadings, extract subject scores, get coordinates of each point
#How to limit number of variable loadings to top 10?
#Overlay on first loading
#Plot x y for rotational
```


```{r}
autoplot(pca_pred, data = pred_Full, colour = "race")
screeplot(pca_pred)
```
```{r}
iners_f_hist <- make_hists(iners_fixed, "Lactobacillus_iners", full_set, vars) + ggtitle("Iners Fixed Plot")
iners_m_hist <- make_hists(iners_mixed, "Lactobacillus_iners", full_set, mx_vars) + ggtitle("Iners Mixed Plot")
ggarrange(iners_f_hist, iners_m_hist)

```

```{r}
crisp_f_hist <- make_hists(crisp_fixed, "Lactobacillus_crispatus_cluster", full_set, vars) + ggtitle("Crisp Fixed Plot")
crisp_m_hist <- make_hists(crisp_mixed, "Lactobacillus_crispatus_cluster", full_set, mx_vars) + ggtitle("Crisp Mixed Plot")
ggarrange(crisp_f_hist, crisp_m_hist)

#clean age, set blanks to NA, clean values, make 18-28 reference group
# do individual plots
# quantify prediction success
# get model summaries
# perform on more taxa
#education, high_school_ged is baseline, put in order of ascending degree

```

```{r }
#predictions <- pred_plot(full_set, mod_Iners)
#lapply(models, model_plot)

mod_Crisp <- taxa.glm(full_set, "Lactobacillus_crispatus_cluster", "class", "fixed")
mod_Iners <- taxa.glm(full_set, "Lactobacillus_iners", "class", "fixed")
mod_BVAB <- taxa.glm(full_set, "Lachnospiraceae_BVAB1", "class", "fixed")
mod_Gard <- taxa.glm(full_set, "Gardnerella_vaginalis", "class", "fixed")
mod_Gass <- taxa.glm(full_set, "Lactobacillus_gasseri_cluster", "class", "fixed")

prop_Crisp <- taxa.glm(full_set, "Lactobacillus_crispatus_cluster", "prop", "fixed")
prop_Iners <- taxa.glm(full_set, "Lactobacillus_iners", "prop", "fixed")
prop_BVAB <- taxa.glm(full_set, "Lachnospiraceae_BVAB1", "prop", "fixed")
prop_Gard <- taxa.glm(full_set, "Gardnerella_vaginalis", "prop", "fixed")
prop_Gass <- taxa.glm(full_set, "Lactobacillus_gasseri_cluster", "prop", "fixed")

ggarrange(model_plot(mod_Crisp), model_plot(prop_Crisp),labels = c("Class Crisp", "Prop Crisp"))
ggarrange(model_plot(mod_Iners), model_plot(prop_Iners),labels = c("Class Iners", "Prop Iners"))
ggarrange(model_plot(mod_BVAB), model_plot(prop_BVAB),labels = c("Class BVAB", "Prop BVAB"))
ggarrange(model_plot(mod_Crisp), model_plot(prop_Iners),labels = c("Class Gard", "Prop Gard"))
ggarrange(model_plot(mod_Gass), model_plot(prop_Gass),labels = c("Class Gass", "Prop Gass"))
model_plot(mod_Crisp) + ggtitle("Class Crispatus")
model_plot(mod_Iners) + ggtitle("Class Iners")
model_plot(mod_BVAB) + ggtitle("Class BVAB")
model_plot(mod_Gard) + ggtitle("Class Gardnerella")
model_plot(mod_Gass) + ggtitle("Class Gasseri")

#alpha_div and vagitype show up in almost every model vaginal_pH in most of them
#correlations between data
#longitudinal plots when stratifying by covariates
#count and proportion plots
#Look at model pre and post aic
#ggaly and regular correlation matrix
#Look at distribution of NA's in covariate (pre-filtering) and see if it affects model by removing it
```


```{r }
mixed_Iners <- taxa.glm(full_set, "Lactobacillus_crispatus_cluster", "mean", "mixed")
#Some linear combination of the parameters perfectly separates failure from successes, I assume it's vagitype
#Doesn't run still

```


```{r}
na.test <-  function (x) {
  w <- sapply(x, function(x)all(is.na(x)))
  if (any(w)) {
    stop(paste("All NA in columns", paste(which(w), collapse=", ")))
  }
}
#Recode High to 1 and Low to 0
full_set[["class_Iners"]] <- recode(full_set[["class_Iners"]], "High"= 1, "Low"= 0)
full_set$class_Iners <- as.factor(full_set$class_Iners)


covs <- c("class_Iners", "vaginal_pH", "reported_ga", "alpha_div", "race", "vagitype", "income", "bv", "phys", "antibiotics_current","SUBJECT_ID")
model_data <- full_set[,covs]
no.na.data <- na.omit(model_data)

reduced_mod <- glmer(class_Iners ~ alpha_div + vagitype + (1|SUBJECT_ID), data = no.na.data, family = binomial, control=glmerControl(optimizer="bobyqa",optCtrl=list(maxfun=2e5))) #Unable to evaluate scaled gradient, model fails to converge
step.model <- stepAIC(log_mod, direction = "both", 
                      trace = FALSE)
#install.packages("ResourceSelection")
library(ResourceSelection)

fixed_mod <- glm(class_Iners ~ vaginal_pH + reported_ga + alpha_div + race + vagitype + income + phys + bv + antibiotics_current, data = no.na.data, family = binomial) 
step.model <- stepAIC(fixed_mod, direction = "both", 
                      trace = FALSE)
step.back <- stepAIC(fixed_mod, direction = "backward")
hoslem.test(no.na.data$class_Iners, fitted(fixed_mod))
hoslem.test(no.na.data$class_Iners, fitted(step.model))
# model becomes class_Iners ~ alpha_div + vagitype
predict(fixed_mod, type = "response")
histogram(predict(fixed_mod, type = "response"))
plot(predict(fixed_mod, type = "response"), full_set$class_Iners)



plotting_dfm <- expand.grid(alpha_div = seq(0, 3.5, 0.01),
                           vagitype = c("Lactobacillus_crispatus_cluster", "Lactobacillus_iners", "Lachnospiraceae_BVAB1", "Gardnerella_vaginalis", "Lactobacillus_gasseri_cluster"))
plotting_dfm$preds <- predict(step.model, newdata=plotting_dfm, type = "response")
pl <- ggplot(plotting_dfm, aes(x=alpha_div, y =preds, color=as.factor(vagitype)))
pl + 
  geom_point( ) +
  ggtitle("Predicted High/Low by Alpha_Div and Vagitype")

#Look at alpha div and vagitype over trimesters and see how it changes 
# plot those who change
#Table of amt that change vs not change
```

```{r }
reduced_mod <- glmer(class_Crisp ~ vaginal_pH + reported_ga + alpha_div + race + vagitype + income + (1|SUBJECT_ID), data = full_set, family = binomial) #Does not converge
reduced_mod <- glmer(class_Gass ~ vaginal_pH + reported_ga + alpha_div + race + vagitype + income + (1|SUBJECT_ID), data = full_set, family = binomial) #Does not converge
reduced_mod <- glmer(class_Gard ~ vaginal_pH + reported_ga + alpha_div + race + vagitype + income + (1|SUBJECT_ID), data = full_set, family = binomial) #Does not converge
reduced_mod <- glmer(class_BVAB ~ vaginal_pH + reported_ga + alpha_div + race + vagitype + income + (1|SUBJECT_ID), data = full_set, family = binomial) #Does not converge
```

```{r}
library(caret)

#Split Data 80:20
training.samps <- full_set$class_Iners %>%
  createDataPartition(p = 0.8, list = FALSE)
train.data <- full_set[training.samps,]
test.data <- full_set[-training.samps,]
#Linear discriminant analysis
#install.packages("mda")
library(mda)
covs_test <- as.data.frame(test.data[,cov])
full_set$vagitype <- as.factor(full_set$vagitype)
full_set$SUBJECT_ID <- as.factor(full_set$SUBJECT_ID)
mda_model <- mda(class_Iners ~ vaginal_pH + reported_ga + alpha_div + race + vagitype + income + SUBJECT_ID, data = full_set)
#summary(mda_model)
mda_model$percent.explained
mda_model$confusion
covs <- covs[which(names(pred) %in% rownames(full_set)),]
#mda_pred <- predict(mda_model,test.data)


# SVM
library(e1071)
rownames(full_set) <- rownames(meta)
cov <- c("vaginal_pH", "reported_ga", "alpha_div", "race", "vagitype", "income", "SUBJECT_ID")
covs <- as.data.frame(full_set[,cov])
svm_model <- svm(class_Iners ~ vaginal_pH + reported_ga + alpha_div + race + vagitype + income + SUBJECT_ID, data = full_set, type = "C")
pred <- predict(svm_model, covs)
table(pred, full_set$class_Iners[which(names(pred) %in% rownames(full_set))])
#Random Forest
#try randomforest with formula and see if random var works
#install.packages("randomForest")
#library(randomForest)
#full_set$vagitype <- as.character(full_set$vagitype)
#full_set$SUBJECT_ID <- as.character(full_set$SUBJECT_ID)
#forest_model <- randomForest(class_Iners ~ vaginal_pH + reported_ga + alpha_div + race + vagitype + SUBJECT_ID, data = full_set, na.action=na.omit, importance = TRUE)

```

```{r Snea}

#Split Data 80:20
training.samps <- full_set$class_Snea %>%
  createDataPartition(p = 0.8, list = FALSE)
train.data <- full_set[training.samps,]
test.data <- full_set[-training.samps,]
#Linear discriminant analysis
#install.packages("mda")
library(mda)
covs_test <- as.data.frame(test.data[,cov])
full_set$vagitype <- as.factor(full_set$vagitype)
full_set$SUBJECT_ID <- as.factor(full_set$SUBJECT_ID)
mda_model <- mda(class_Snea ~ vaginal_pH + reported_ga + alpha_div + race + vagitype + SUBJECT_ID, data = train.data)
#summary(mda_model)
mda_model$percent.explained
mda_model$confusion
covs <- covs[which(names(pred) %in% rownames(full_set)),]
#mda_pred <- predict(mda_model,test.data)


# SVM
library(e1071)
rownames(full_set) <- rownames(meta)
cov <- c("vaginal_pH", "reported_ga", "alpha_div", "race", "vagitype", "SUBJECT_ID")
covs <- as.data.frame(full_set[,cov])
svm_model <- svm(class_Snea ~ vaginal_pH + reported_ga + alpha_div + race + vagitype + SUBJECT_ID, data = full_set, type = "C")
pred <- predict(svm_model, covs)
table(pred, full_set$class_Snea[which(names(pred) %in% rownames(full_set))])
```


```{r Prev}

#Split Data 80:20
training.samps <- full_set$class_Prev %>%
  createDataPartition(p = 0.8, list = FALSE)
train.data <- full_set[training.samps,]
test.data <- full_set[-training.samps,]
#Linear discriminant analysis
#install.packages("mda")
library(mda)
covs_test <- as.data.frame(test.data[,cov])
full_set$vagitype <- as.factor(full_set$vagitype)
full_set$SUBJECT_ID <- as.factor(full_set$SUBJECT_ID)
mda_model <- mda(class_Prev ~ vaginal_pH + reported_ga + alpha_div + race + vagitype + SUBJECT_ID, data = train.data)
#summary(mda_model)
mda_model$percent.explained
covs <- covs[which(names(pred) %in% rownames(full_set)),]
mda_model$confusion

#mda_pred <- predict(mda_model,test.data)


# SVM
library(e1071)
rownames(full_set) <- rownames(meta)
cov <- c("vaginal_pH", "reported_ga", "alpha_div", "race", "vagitype", "SUBJECT_ID")
covs <- as.data.frame(full_set[,cov])
svm_model <- svm(class_Prev ~ vaginal_pH + reported_ga + alpha_div + race + vagitype + SUBJECT_ID, data = full_set, type = "C")
pred <- predict(svm_model, covs)
table(pred, full_set$class_Prev[which(names(pred) %in% rownames(full_set))])
```

```{r TM7}

#Split Data 80:20
training.samps <- full_set$class_TM %>%
  createDataPartition(p = 0.8, list = FALSE)
train.data <- full_set[training.samps,]
test.data <- full_set[-training.samps,]
#Linear discriminant analysis
#install.packages("mda")
library(mda)
covs_test <- as.data.frame(test.data[,cov])
full_set$vagitype <- as.factor(full_set$vagitype)
full_set$SUBJECT_ID <- as.factor(full_set$SUBJECT_ID)
mda_model <- mda(class_TM ~ vaginal_pH + reported_ga + alpha_div + race + vagitype + SUBJECT_ID, data = train.data)
#summary(mda_model)
mda_model$percent.explained
covs <- covs[which(names(pred) %in% rownames(full_set)),]
mda_model$confusion
#mda_pred <- predict(mda_model,test.data)


# SVM
library(e1071)
rownames(full_set) <- rownames(meta)
cov <- c("vaginal_pH", "reported_ga", "alpha_div", "race", "vagitype", "SUBJECT_ID")
covs <- as.data.frame(full_set[,cov])
svm_model <- svm(class_TM ~ vaginal_pH + reported_ga + alpha_div + race + vagitype + SUBJECT_ID, data = full_set, type = "C")
pred <- predict(svm_model, covs)
table(pred, full_set$class_TM[which(names(pred) %in% rownames(full_set))])
```

```{r BVAB}

#Split Data 80:20
training.samps <- full_set$class_BVAB %>%
  createDataPartition(p = 0.8, list = FALSE)
train.data <- full_set[training.samps,]
test.data <- full_set[-training.samps,]
#Linear discriminant analysis
#install.packages("mda")
library(mda)
covs_test <- as.data.frame(test.data[,cov])
full_set$vagitype <- as.factor(full_set$vagitype)
full_set$SUBJECT_ID <- as.factor(full_set$SUBJECT_ID)
mda_model <- mda(class_BVAB ~ vaginal_pH + reported_ga + alpha_div + race + vagitype + SUBJECT_ID, data = train.data)
#summary(mda_model)
mda_model$percent.explained
covs <- covs[which(names(pred) %in% rownames(full_set)),]
mda_model$confusion
#mda_pred <- predict(mda_model,test.data)


# SVM
library(e1071)
rownames(full_set) <- rownames(meta)
cov <- c("vaginal_pH", "reported_ga", "alpha_div", "race", "vagitype", "SUBJECT_ID")
covs <- as.data.frame(full_set[,cov])
svm_model <- svm(class_BVAB ~ vaginal_pH + reported_ga + alpha_div + race + vagitype + SUBJECT_ID, data = full_set, type = "C")
pred <- predict(svm_model, covs)
table(pred, full_set$class_BVAB[which(names(pred) %in% rownames(full_set))])
```

